这里的 \(\mu\) 和 \(\sigma\) 表示均值和标准差. a
表示具体对应的某一个类别下标. b 表示实例中某个属性的下标. m
表示类别总数, n 表示属性总数. \(x_b\)
表示实例的第 b 个属性值. 求出使 \(\log{P^L(D_a)} +
\sum_{b=1}^{n}\left(-\log\sigma_{ab} -
\frac{(x_b-\mu_{ab})^2}{2\sigma_{ab}^2}\right)\) 最大时 a
的值.
publicclassNaiveBayesForNumerical { /** ************************* * An inner class to store parameters. ************************* */ privatestaticclassGaussianParameters { double mu; double sigma;
publicGaussianParameters(double paraMu, double paraSigma) { mu = paraMu; sigma = paraSigma; }// Of the constructor
public String toString() { return"(" + mu + ", " + sigma + ")"; }// Of toString }// Of GaussianParameters
/**
 * The constant indicating numerical data. NOTE(review): referenced by
 * setDataType and classify but undeclared in the original source; declared
 * here with value 1 — confirm against the companion nominal version.
 */
public static final int NUMERICAL = 1;

/**
 * The data type of the current dataset.
 */
int dataType = NUMERICAL;

/**
 * The data.
 */
Instances dataset;

/**
 * The number of classes. For binary classification it is 2.
 */
int numClasses;

/**
 * The number of instances.
 */
int numInstances;

/**
 * The number of conditional attributes.
 */
int numConditions;

/**
 * The prediction, including queried and predicted labels.
 */
int[] predicts;

/**
 * Class distribution.
 */
double[] classDistribution;

/**
 * Class distribution with Laplacian smooth.
 */
double[] classDistributionLaplacian;

/**
 * The Gaussian parameters, indexed [class][attribute].
 */
GaussianParameters[][] gaussianParameters;
/**
 ********************
 * The constructor. Reads the ARFF file, takes the last attribute as the
 * class, and caches the dataset dimensions.
 *
 * @param paraFilename The given file.
 ********************
 */
public NaiveBayesForNumerical(String paraFilename) {
    dataset = null;
    try {
        FileReader tempReader = new FileReader(paraFilename);
        dataset = new Instances(tempReader);
        tempReader.close();
    } catch (Exception ee) {
        System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
        System.exit(0);
    } // Of try

    dataset.setClassIndex(dataset.numAttributes() - 1);
    numConditions = dataset.numAttributes() - 1;
    numInstances = dataset.numInstances();
    numClasses = dataset.attribute(numConditions).numValues();
}// Of the constructor
/** ******************** * Set the data type. ******************** */ publicvoidsetDataType(int paraDataType) { dataType = paraDataType; }// Of setDataType
/** ******************** * Calculate the class distribution with Laplacian smooth. ******************** */ publicvoidcalculateClassDistribution() { classDistribution = newdouble[numClasses]; classDistributionLaplacian = newdouble[numClasses];
double[] tempCounts = newdouble[numClasses]; for (inti=0; i < numInstances; i++) { inttempClassValue= (int) dataset.instance(i).classValue(); tempCounts[tempClassValue]++; } // Of for i
for (inti=0; i < numClasses; i++) { classDistribution[i] = tempCounts[i] / numInstances; classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses); } // Of for i
System.out.println("Class distribution: " + Arrays.toString(classDistribution)); System.out.println("Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian)); }// Of calculateClassDistribution
/** ******************** * Calculate the conditional probabilities with Laplacian smooth. ******************** */ publicvoidcalculateGaussianParameters() { gaussianParameters = newGaussianParameters[numClasses][numConditions];
double[] tempValuesArray = newdouble[numInstances]; int tempNumValues; double tempSum;
for (inti=0; i < numClasses; i++) { for (intj=0; j < numConditions; j++) { tempSum = 0;
// Obtain values for this class. tempNumValues = 0; for (intk=0; k < numInstances; k++) { if ((int) dataset.instance(k).classValue() != i) { continue; } // Of if
tempValuesArray[tempNumValues] = dataset.instance(k).value(j); tempSum += tempValuesArray[tempNumValues]; tempNumValues++; } // Of for k
doubletempSigma=0; for (intk=0; k < tempNumValues; k++) { tempSigma += (tempValuesArray[k] - tempMu) * (tempValuesArray[k] - tempMu); } // Of for k tempSigma /= tempNumValues; tempSigma = Math.sqrt(tempSigma);
gaussianParameters[i][j] = newGaussianParameters(tempMu, tempSigma); } // Of for j } // Of for i
System.out.println(Arrays.deepToString(gaussianParameters)); }// Of calculateGaussianParameters
/** ******************** * Classify all instances, the results are stored in predicts[]. ******************** */ publicvoidclassify() { predicts = newint[numInstances]; for (inti=0; i < numInstances; i++) { predicts[i] = classify(dataset.instance(i)); } // Of for i }// Of classify
/** ******************** * Classify an instances. ******************** */ publicintclassify(Instance paraInstance) { if (dataType == NUMERICAL) { return classifyNumerical(paraInstance); } // Of if
return -1; }// Of classify
/** ******************** * Classify an instances with numerical data. ******************** */ publicintclassifyNumerical(Instance paraInstance) { // Find the biggest one doubletempBiggest= -10000; intresultBestIndex=0;
for (inti=0; i < numClasses; i++) { doubletempPseudoProbability= Math.log(classDistributionLaplacian[i]); for (intj=0; j < numConditions; j++) { doubletempAttributeValue= paraInstance.value(j); doubletempSigma= gaussianParameters[i][j].sigma; doubletempMu= gaussianParameters[i][j].mu;
if (tempBiggest < tempPseudoProbability) { tempBiggest = tempPseudoProbability; resultBestIndex = i; } // Of if } // Of for i
return resultBestIndex; }// Of classifyNumerical
/** ******************** * Compute accuracy. ******************** */ publicdoublecomputeAccuracy() { doubletempCorrect=0; for (inti=0; i < numInstances; i++) { if (predicts[i] == (int) dataset.instance(i).classValue()) { tempCorrect++; } // Of if } // Of for i
return tempCorrect / numInstances; }// Of computeAccuracy
/** ************************* * Test numerical data. ************************* */ publicstaticvoidtestNumerical() { System.out.println("Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption."); StringtempFilename="D:/Work/sampledata/iris-imbalance.arff";
System.out.println("The accuracy is: " + tempLearner.computeAccuracy()); }// Of testNominal
/** ************************* * Test this class. * * @param args Not used now. ************************* */ publicstaticvoidmain(String[] args) { testNumerical(); }// Of main } // Of class NaiveBayesForNumerical