/** * The distance measure. */ publicintdistanceMeasure= EUCLIDEAN;
/** * A random instance; */ publicstaticfinalRandomrandom=newRandom();
/** * The data. */ Instances dataset;
/** * The number of clusters. */ intnumClusters=2;
/** * The clusters. */ int[][] clusters;
/**
 * ******************************
 * The first constructor. Reads the data set from the given file.
 *
 * If the file cannot be opened or parsed, a message is printed to standard
 * output and the JVM is terminated.
 *
 * @param paraFilename The data filename.
 * ******************************
 */
public KMeans(String paraFilename) {
    dataset = null;
    // try-with-resources closes the reader on every path, including the
    // one where parsing the file throws.
    try (FileReader fileReader = new FileReader(paraFilename)) {
        dataset = new Instances(fileReader);
    } catch (Exception ee) {
        System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
        // The exit status is kept as before so existing callers see no change.
        System.exit(0);
    } // Of try
}// Of the first constructor
/** * ****************************** * A setter. * ****************************** */ publicvoidsetNumClusters(int paraNumClusters) { numClusters = paraNumClusters; }// Of the setter
/**
 * ********************
 * Get a random permutation of the indices 0 .. paraLength - 1, for data
 * randomization.
 *
 * @param paraLength The length of the sequence.
 * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
 * ********************
 */
public static int[] getRandomIndices(int paraLength) {
    int[] resultIndices = new int[paraLength];

    // Step 1. Initialize with the identity order.
    for (int i = 0; i < paraLength; i++) {
        resultIndices[i] = i;
    } // Of for i

    // Step 2. Fisher-Yates shuffle. Walking from the back and swapping each
    // position with a uniformly chosen position at or before it makes every
    // permutation equally likely; swapping two arbitrary random positions
    // paraLength times does not.
    for (int i = paraLength - 1; i > 0; i--) {
        int tempOther = random.nextInt(i + 1);

        int tempValue = resultIndices[i];
        resultIndices[i] = resultIndices[tempOther];
        resultIndices[tempOther] = tempValue;
    } // Of for i

    return resultIndices;
}// Of getRandomIndices
/** * ******************** * The distance between two instances. * * @param paraI The index of the first instance. * @param paraArray The array representing a point in the space. * @return The distance. * ******************** */ publicdoubledistance(int paraI, double[] paraArray) { intresultDistance=0; double tempDifference; switch (distanceMeasure) { case MANHATTAN: for (inti=0; i < dataset.numAttributes() - 1; i++) { tempDifference = dataset.instance(paraI).value(i) - paraArray[i]; if (tempDifference < 0) { resultDistance -= tempDifference; } else { resultDistance += tempDifference; } // Of if } // Of for i break;
case EUCLIDEAN: for (inti=0; i < dataset.numAttributes() - 1; i++) { tempDifference = dataset.instance(paraI).value(i) - paraArray[i]; resultDistance += tempDifference * tempDifference; } // Of for i break; default: System.out.println("Unsupported distance measure: " + distanceMeasure); }// Of switch
// NOTE(review): the header of the enclosing method, the declarations of
// tempCenters, tempClusterArray and tempClusterLengths, and the header of the
// while loop closed below are not visible in this part of the file. The
// comments added here describe only the statements that are shown.
// Seed center i with the attribute values of the instance found at position i
// of a random index order, so the seeds come from different instance indices.
// Step 1. Initialize centers. int[] tempRandomOrders = getRandomIndices(dataset.numInstances()); for (inti=0; i < numClusters; i++) { for (intj=0; j < tempCenters[0].length; j++) { tempCenters[i][j] = dataset.instance(tempRandomOrders[i]).value(j); } // Of for j } // Of for i
// For every instance, find the center with the smallest distance(). The
// strict '>' comparison keeps the lowest-indexed center when distances tie.
// Step 2.1 Minimization. Assign cluster to each instance. int tempNearestCenter; double tempNearestDistance; double tempDistance;
for (inti=0; i < dataset.numInstances(); i++) { tempNearestCenter = -1; tempNearestDistance = Double.MAX_VALUE;
for (intj=0; j < numClusters; j++) { tempDistance = distance(i, tempCenters[j]); if (tempNearestDistance > tempDistance) { tempNearestDistance = tempDistance; tempNearestCenter = j; } // Of if } // Of for j tempClusterArray[i] = tempNearestCenter; } // Of for i
// Sum the attribute values of the members of each cluster and count the
// members. A newly allocated int array is already zero-filled, so the
// Arrays.fill call on tempClusterLengths is redundant.
// Step 2.2 Mean. Find new centers. tempClusterLengths = newint[numClusters]; Arrays.fill(tempClusterLengths, 0); double[][] tempNewCenters = newdouble[numClusters][dataset.numAttributes() - 1]; // Arrays.fill(tempNewCenters, 0); for (inti=0; i < dataset.numInstances(); i++) { for (intj=0; j < tempNewCenters[0].length; j++) { tempNewCenters[tempClusterArray[i]][j] += dataset.instance(i).value(j); } // Of for j tempClusterLengths[tempClusterArray[i]]++; } // Of for i
// Turn the sums into means.
// NOTE(review): a cluster that received no instance has a count of 0, so its
// center becomes NaN (0.0 / 0) instead of raising an error.
// Step 2.3 Now average for (inti=0; i < tempNewCenters.length; i++) { for (intj=0; j < tempNewCenters[0].length; j++) { tempNewCenters[i][j] /= tempClusterLengths[i]; } // Of for j } // Of for i
// Report the new centers and use them for the next pass of the loop.
System.out.println("Now the new centers are: " + Arrays.deepToString(tempNewCenters)); tempCenters = tempNewCenters; } // Of while
// Allocate one index array per cluster, sized by the member counts.
// NOTE(review): if tempClusterLengths is null here, clusters[i] stays null and
// the fill loop that follows would throw a NullPointerException.
// Step 3. Form clusters. clusters = newint[numClusters][]; int[] tempCounters = newint[numClusters]; for (inti=0; i < numClusters; i++) { if (tempClusterLengths != null) { clusters[i] = newint[tempClusterLengths[i]]; } } // Of for i
// Append each instance index to its cluster; tempCounters tracks the next free
// slot of every cluster.
for (inti=0; i < tempClusterArray.length; i++) { clusters[tempClusterArray[i]][tempCounters[tempClusterArray[i]]] = i; tempCounters[tempClusterArray[i]]++; } // Of for i
// Print the member indices of every cluster, one cluster per line.
System.out.println("The clusters are: \r\n");
for (inti=0; i < clusters.length; i++) { System.out.print("clusters " + i + ": "); for (intj=0; j < clusters[i].length; j++) { System.out.print( clusters[i][j] + " "); } System.out.println(); } }// Of clustering
/** * ************************ * A testing method. * ************************ */ publicstaticvoidmain(String[] args) { testClustering(); }// Of main } // Of class KMeans