Java学习-Day30

数值型数据的 NB 算法

一、数据处理

1. 发现问题

不同于之前符号型的数据, 数值型的数据在理论和实际上来说它的取值点是无穷的. 我们只能通过限定一段区间来说明概率.

如设考试成绩为 Score 则及格的概率为 \(P(60 \le Score)\) 是可以的, 但是如果说刚好及格的概率 \(P(Score = 60)\) 这个式子就为 0.

所以我们这里所做的工作就是把之前式子中的概率 \(P\) 换为概率密度函数 \(p\).

2. 处理

在第 29 天的时候得到了如下的这样一个式子.

\[ P(D|\mathrm{x}) = \frac{P(\mathrm{x}D)}{P(\mathrm{x})} = \frac{P(D) \prod_{i=1}^{m}P(x_i|D) }{P(\mathrm{x})} \tag{1} \]

这里需要做两件事:

  • 根据数据及分布假设, 求得概率密度函数 \(p(x)\)

\[ p(x) = \frac{1}{\sqrt{2\pi} \sigma } exp\left (-\frac{(x-\mu)^2}{2\sigma^2} \right ) \tag{2} \]

  • 直接用 \(p(x)\) 替换掉式 (1) 中的 \(P(x_i|D)\), 常用的 \(p(x)\) 为正态高斯分布. 替换后对其进行求 log 的操作, 以防数值溢出. 如下所示得到我们最后需要求的表达式.

\[ d(\mathrm{x}) = \underset{1 \le a \le m}{\mathrm{argmax}}\ \left[ \log{P^L(D_a)} + \sum_{b=1}^{n}\left(-\log\sigma_{ab} - \frac{(x_b-\mu_{ab})^2}{2\sigma_{ab}^2}\right) \right] \tag{3} \]

这里的 \(\mu\) 和 \(\sigma\) 分别表示均值和标准差. a 表示具体对应的某一个类别下标, b 表示实例中某个属性的下标. m 表示类别总数, n 表示属性总数. \(x_b\) 表示待分类实例的第 b 个属性值. 求出的值为使 \(\log{P^L (D_a)} + \sum_{b=1}^{n}\left(-\log\sigma_{ab} - \frac{(x_b-\mu_{ab})^2}{2\sigma_{ab}^2}\right)\) 取最大时 a 的值.

二、算法实现

1. 具体代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
package bayes;

import java.io.FileReader;
import java.util.Arrays;

import weka.core.*;

/**
 * The Naive Bayes algorithm for numerical data. Each conditional attribute is
 * assumed to follow a per-class Gaussian (normal) distribution; classification
 * maximizes the log of the (Laplacian-smoothed) class prior plus the sum of
 * log class-conditional densities, which avoids numeric underflow.
 *
 * @author Shi-Huai Wen Email: shihuaiwen@outlook.com.
 */
public class NaiveBayesForNumerical {
    /**
     *************************
     * An inner class to store the Gaussian parameters (mean and standard
     * deviation) of one attribute under one class.
     *************************
     */
    private static class GaussianParameters {
        double mu;
        double sigma;

        public GaussianParameters(double paraMu, double paraSigma) {
            mu = paraMu;
            sigma = paraSigma;
        }// Of the constructor

        @Override
        public String toString() {
            return "(" + mu + ", " + sigma + ")";
        }// Of toString
    }// Of GaussianParameters

    /**
     * The data.
     */
    Instances dataset;

    /**
     * The number of classes. For binary classification it is 2.
     */
    int numClasses;

    /**
     * The number of instances.
     */
    int numInstances;

    /**
     * The number of conditional attributes.
     */
    int numConditions;

    /**
     * The prediction, including queried and predicted labels.
     */
    int[] predicts;

    /**
     * Class distribution.
     */
    double[] classDistribution;

    /**
     * Class distribution with Laplacian smooth.
     */
    double[] classDistributionLaplacian;

    /**
     * The Gaussian parameters, indexed [class][attribute].
     */
    GaussianParameters[][] gaussianParameters;

    /**
     * Data type.
     */
    int dataType;

    /**
     * Numerical.
     */
    public static final int NUMERICAL = 1;

    /**
     * Lower bound for the standard deviation. A constant attribute within a
     * class would give sigma == 0, making log(sigma) and the density undefined
     * (NaN scores that always lose comparisons). Flooring sigma keeps the
     * log-score finite, the same idea as variance smoothing in other NB tools.
     */
    private static final double MIN_SIGMA = 1e-10;

    /**
     ********************
     * The constructor. Reads the ARFF file and treats the last attribute as
     * the class.
     *
     * @param paraFilename The given file.
     ********************
     */
    public NaiveBayesForNumerical(String paraFilename) {
        dataset = null;
        try {
            FileReader fileReader = new FileReader(paraFilename);
            dataset = new Instances(fileReader);
            fileReader.close();
        } catch (Exception ee) {
            System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
            // Nonzero status: terminating because of an unrecoverable read error.
            System.exit(1);
        } // Of try

        dataset.setClassIndex(dataset.numAttributes() - 1);
        numConditions = dataset.numAttributes() - 1;
        numInstances = dataset.numInstances();
        numClasses = dataset.attribute(numConditions).numValues();
    }// Of the constructor

    /**
     ********************
     * Set the data type.
     *
     * @param paraDataType Currently only NUMERICAL is supported.
     ********************
     */
    public void setDataType(int paraDataType) {
        dataType = paraDataType;
    }// Of setDataType

    /**
     ********************
     * Calculate the class distribution with Laplacian smooth.
     ********************
     */
    public void calculateClassDistribution() {
        classDistribution = new double[numClasses];
        classDistributionLaplacian = new double[numClasses];

        // Count instances per class.
        double[] tempCounts = new double[numClasses];
        for (int i = 0; i < numInstances; i++) {
            int tempClassValue = (int) dataset.instance(i).classValue();
            tempCounts[tempClassValue]++;
        } // Of for i

        for (int i = 0; i < numClasses; i++) {
            classDistribution[i] = tempCounts[i] / numInstances;
            // Laplacian smoothing: add 1 to each class count so no prior is zero.
            classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses);
        } // Of for i

        System.out.println("Class distribution: " + Arrays.toString(classDistribution));
        System.out.println("Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian));
    }// Of calculateClassDistribution

    /**
     ********************
     * Calculate the Gaussian parameters (mean and standard deviation) of every
     * conditional attribute under every class.
     ********************
     */
    public void calculateGaussianParameters() {
        gaussianParameters = new GaussianParameters[numClasses][numConditions];

        double[] tempValuesArray = new double[numInstances];
        int tempNumValues;
        double tempSum;

        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                tempSum = 0;

                // Collect the values of attribute j for instances of class i.
                tempNumValues = 0;
                for (int k = 0; k < numInstances; k++) {
                    if ((int) dataset.instance(k).classValue() != i) {
                        continue;
                    } // Of if

                    tempValuesArray[tempNumValues] = dataset.instance(k).value(j);
                    tempSum += tempValuesArray[tempNumValues];
                    tempNumValues++;
                } // Of for k

                // Mean of the collected values.
                double tempMu = tempSum / tempNumValues;

                // Population standard deviation.
                double tempSigma = 0;
                for (int k = 0; k < tempNumValues; k++) {
                    tempSigma += (tempValuesArray[k] - tempMu) * (tempValuesArray[k] - tempMu);
                } // Of for k
                tempSigma /= tempNumValues;
                tempSigma = Math.sqrt(tempSigma);

                // Floor sigma to keep log(sigma) and the density well-defined.
                if (tempSigma < MIN_SIGMA) {
                    tempSigma = MIN_SIGMA;
                } // Of if

                gaussianParameters[i][j] = new GaussianParameters(tempMu, tempSigma);
            } // Of for j
        } // Of for i

        System.out.println(Arrays.deepToString(gaussianParameters));
    }// Of calculateGaussianParameters

    /**
     ********************
     * Classify all instances, the results are stored in predicts[].
     ********************
     */
    public void classify() {
        predicts = new int[numInstances];
        for (int i = 0; i < numInstances; i++) {
            predicts[i] = classify(dataset.instance(i));
        } // Of for i
    }// Of classify

    /**
     ********************
     * Classify an instance.
     *
     * @param paraInstance The given instance.
     * @return The predicted class index, or -1 if the data type is unsupported.
     ********************
     */
    public int classify(Instance paraInstance) {
        if (dataType == NUMERICAL) {
            return classifyNumerical(paraInstance);
        } // Of if

        return -1;
    }// Of classify

    /**
     ********************
     * Classify an instance with numerical data: argmax over classes of
     * log P^L(D_a) + sum_b (-log sigma_ab - (x_b - mu_ab)^2 / (2 sigma_ab^2)).
     *
     * @param paraInstance The given instance.
     * @return The predicted class index.
     ********************
     */
    public int classifyNumerical(Instance paraInstance) {
        // Find the biggest pseudo-probability. NEGATIVE_INFINITY (not an
        // arbitrary constant such as -10000) guarantees the first class's
        // score is always accepted, no matter how small the log-scores get.
        double tempBiggest = Double.NEGATIVE_INFINITY;
        int resultBestIndex = 0;

        for (int i = 0; i < numClasses; i++) {
            // Start from the log of the smoothed prior.
            double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
            for (int j = 0; j < numConditions; j++) {
                double tempAttributeValue = paraInstance.value(j);
                double tempSigma = gaussianParameters[i][j].sigma;
                double tempMu = gaussianParameters[i][j].mu;

                // Log of the Gaussian density; the constant -log(sqrt(2 pi))
                // is the same for every class and therefore omitted.
                tempPseudoProbability += -Math.log(tempSigma) - (tempAttributeValue - tempMu)
                        * (tempAttributeValue - tempMu) / (2 * tempSigma * tempSigma);
            } // Of for j

            if (tempBiggest < tempPseudoProbability) {
                tempBiggest = tempPseudoProbability;
                resultBestIndex = i;
            } // Of if
        } // Of for i

        return resultBestIndex;
    }// Of classifyNumerical

    /**
     ********************
     * Compute accuracy on the training set (predictions vs. queried labels).
     *
     * @return The fraction of correctly classified instances.
     ********************
     */
    public double computeAccuracy() {
        double tempCorrect = 0;
        for (int i = 0; i < numInstances; i++) {
            if (predicts[i] == (int) dataset.instance(i).classValue()) {
                tempCorrect++;
            } // Of if
        } // Of for i

        return tempCorrect / numInstances;
    }// Of computeAccuracy

    /**
     *************************
     * Test numerical data.
     *************************
     */
    public static void testNumerical() {
        System.out.println("Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");
        String tempFilename = "D:/Work/sampledata/iris-imbalance.arff";

        NaiveBayesForNumerical tempLearner = new NaiveBayesForNumerical(tempFilename);
        tempLearner.setDataType(NUMERICAL);
        tempLearner.calculateClassDistribution();
        tempLearner.calculateGaussianParameters();
        tempLearner.classify();

        System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
    }// Of testNumerical

    /**
     *************************
     * Test this class.
     *
     * @param args Not used now.
     *************************
     */
    public static void main(String[] args) {
        testNumerical();
    }// Of main
} // Of class NaiveBayesForNumerical

2. 运行截图

总结

  1. 数值型数据相比之前符号型数据, 处理上就只有一个概率变概率密度的过程.

  2. 假设所有属性的属性值都服从高斯分布. 也可以做其它假设. 在大多数工具包中, 例如 Python 对这样的处理也是默认的高斯分布.

  3. 将概率密度当成概率值直接使用 Bayes 公式. 和这个处理方式类似的还有就是核密度估计.