/**
 * The number of classes, i.e., the number of values of the class attribute.
 * For binary classification it is 2.
 */
int numClasses;

/**
 * The number of instances in the dataset.
 */
int numInstances;

/**
 * The number of conditional attributes, i.e., all attributes except the
 * class attribute.
 */
int numConditions;

/**
 * The predicted class index of each instance, filled by classify().
 */
int[] predicts;

/**
 * Class distribution: the fraction of instances belonging to each class.
 */
double[] classDistribution;

/**
 * Class distribution with Laplacian smooth:
 * (count + 1) / (numInstances + numClasses).
 */
double[] classDistributionLaplacian;

/**
 * Occurrence counts used to derive the conditional probabilities, indexed
 * as [class][attribute][attribute value].
 */
double[][][] conditionalCounts;

/**
 * The conditional probabilities with Laplacian smooth, indexed as
 * [class][attribute][attribute value].
 */
double[][][] conditionalProbabilitiesLaplacian;

/**
 * Data type code; classify(Instance) compares it with NOMINAL to choose the
 * classification routine.
 */
int dataType;
/** * Nominal. */ publicstaticfinalintNOMINAL=0;
/**
 *********************
 * The constructor. Reads the dataset from the given file, marks the last
 * attribute as the class attribute and caches the dataset dimensions
 * (number of conditional attributes, instances and classes).
 * <p>
 * If the file cannot be read, a message is printed and the JVM exits with a
 * non-zero status so that callers and scripts can detect the failure.
 *
 * @param paraFilename The given file.
 *********************
 */
public NaiveBayesForNominal(String paraFilename) {
    dataset = null;
    // try-with-resources closes the reader even when parsing the file fails.
    try (FileReader fileReader = new FileReader(paraFilename)) {
        dataset = new Instances(fileReader);
    } catch (Exception ee) {
        System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
        // Exit status 1: this is an error path, not a normal termination.
        System.exit(1);
    } // Of try

    // The last attribute is the class; all preceding ones are conditions.
    dataset.setClassIndex(dataset.numAttributes() - 1);
    numConditions = dataset.numAttributes() - 1;
    numInstances = dataset.numInstances();
    numClasses = dataset.attribute(numConditions).numValues();
}// Of the constructor
/** * ******************* * Set the data type. * ******************* */ publicvoidsetDataType(int paraDataType) { dataType = paraDataType; }// Of setDataType
/** * ******************* * Calculate the class distribution with Laplacian smooth. * ******************* */ publicvoidcalculateClassDistribution() { classDistribution = newdouble[numClasses]; classDistributionLaplacian = newdouble[numClasses];
double[] tempCounts = newdouble[numClasses]; for (inti=0; i < numInstances; i++) { inttempClassValue= (int) dataset.instance(i).classValue(); tempCounts[tempClassValue]++; } // Of for i
for (inti=0; i < numClasses; i++) { classDistribution[i] = tempCounts[i] / numInstances; classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses); } // Of for i
System.out.println("Class distribution: " + Arrays.toString(classDistribution)); System.out.println("Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian)); }// Of calculateClassDistribution
/** * ******************* * Calculate the conditional probabilities with Laplacian smooth. ONLY scan * the dataset once. There was a simpler one, I have removed it because the * time complexity is higher. * ******************* */ publicvoidcalculateConditionalProbabilities() { conditionalCounts = newdouble[numClasses][numConditions][]; conditionalProbabilitiesLaplacian = newdouble[numClasses][numConditions][];
// Allocate space for (inti=0; i < numClasses; i++) { for (intj=0; j < numConditions; j++) { inttempNumValues= dataset.attribute(j).numValues(); conditionalCounts[i][j] = newdouble[tempNumValues]; conditionalProbabilitiesLaplacian[i][j] = newdouble[tempNumValues]; } // Of for j } // Of for i
// Count the numbers int[] tempClassCounts = newint[numClasses]; for (inti=0; i < numInstances; i++) { inttempClass= (int) dataset.instance(i).classValue(); tempClassCounts[tempClass]++; for (intj=0; j < numConditions; j++) { inttempValue= (int) dataset.instance(i).value(j); conditionalCounts[tempClass][j][tempValue]++; } // Of for j } // Of for i
// Now for the real probability with Laplacian for (inti=0; i < numClasses; i++) { for (intj=0; j < numConditions; j++) { inttempNumValues= dataset.attribute(j).numValues(); for (intk=0; k < tempNumValues; k++) { conditionalProbabilitiesLaplacian[i][j][k] = (conditionalCounts[i][j][k] + 1) / (tempClassCounts[i] + tempNumValues); } // Of for k } // Of for j } // Of for i
System.out.println("Conditional probabilities: " + Arrays.deepToString(conditionalCounts)); }// Of calculateConditionalProbabilities
/** * ******************* * Classify all instances, the results are stored in predicts[]. * ******************* */ publicvoidclassify() { predicts = newint[numInstances]; for (inti=0; i < numInstances; i++) { predicts[i] = classify(dataset.instance(i)); } // Of for i }// Of classify
/** * ******************* * Classify an instances. * ******************* */ publicintclassify(Instance paraInstance) { if (dataType == NOMINAL) { return classifyNominal(paraInstance); } return -1; }// Of classify
/** * ******************* * Classify an instances with nominal data. * ******************* */ publicintclassifyNominal(Instance paraInstance) { // Find the biggest one doubletempBiggest= -10000; intresultBestIndex=0; for (inti=0; i < numClasses; i++) { doubletempClassProbabilityLaplacian= Math.log(classDistributionLaplacian[i]); doubletempPseudoProbability= tempClassProbabilityLaplacian; for (intj=0; j < numConditions; j++) { inttempAttributeValue= (int) paraInstance.value(j);
// Laplacian smooth. tempPseudoProbability += Math.log(conditionalProbabilitiesLaplacian[i][j][tempAttributeValue]); } // Of for j
if (tempBiggest < tempPseudoProbability) { tempBiggest = tempPseudoProbability; resultBestIndex = i; } // Of if } // Of for i
return resultBestIndex; }// Of classifyNominal
/** * ******************* * Compute accuracy. * ******************* */ publicdoublecomputeAccuracy() { doubletempCorrect=0; for (inti=0; i < numInstances; i++) { if (predicts[i] == (int) dataset.instance(i).classValue()) { tempCorrect++; } // Of if } // Of for i
return tempCorrect / numInstances; }// Of computeAccuracy
/** * ************************ * Test nominal data. * ************************ */ publicstaticvoidtestNominal() { System.out.println("Hello, Naive Bayes. I only want to test the nominal data."); StringtempFilename="D:/Work/sampledata/mushroom.arff";
System.out.println("The accuracy is: " + tempLearner.computeAccuracy()); }// Of testNominal
/** * ************************ * Test this class. * * @param args Not used now. * ************************ */ publicstaticvoidmain(String[] args) { testNominal(); }// Of main } // Of class NaiveBayesForNominal