/** * Sorted indices, where the first element indicates the instance with the * biggest density. */ int[] descendantDensities;
/** * Priority */ double[] priority;
/** * The maximal distance between any pair of points. */ double maximalDistance;
/** * Who is my master? */ int[] masters;
/** * Predicted labels. */ int[] predictedLabels;
/** * Instance status. 0 for unprocessed, 1 for queried, 2 for classified. */ int[] instanceStatusArray;
/** * The descendant indices to show the representativeness of instances in a * descendant order. */ int[] descendantRepresentatives;
/** * Indicate the cluster of each instance. It is only used in * clusterInTwo(int[]); */ int[] clusterIndices;
/** * Blocks with size no more than this threshold should not be split further. */ intsmallBlockThreshold=3;
/** * ********************************* * The constructor. * * @param paraFilename The data filename. * ********************************* */ publicAlec(String paraFilename) { try { FileReadertempReader=newFileReader(paraFilename); dataset = newInstances(tempReader); dataset.setClassIndex(dataset.numAttributes() - 1); tempReader.close(); } catch (Exception ee) { System.out.println("Alec Error" + ee); System.exit(0); } // Of fry computeMaximalDistance(); clusterIndices = newint[dataset.numInstances()]; }// Of the constructor
/** * ********************************* * Merge sort in descendant order to obtain an index array. The original * array is unchanged. The method should be tested further. <br> * Examples: input [1.2, 2.3, 0.4, 0.5], output [1, 0, 3, 2]. <br> * input [3.1, 5.2, 6.3, 2.1, 4.4], output [2, 1, 4, 0, 3]. * * @param paraArray the original array * @return The sorted indices. * ********************************* */ publicstaticint[] mergeSortToIndices(double[] paraArray) { inttempLength= paraArray.length; int[][] resultMatrix = newint[2][tempLength];// For merge sort.
// Initialize inttempIndex=0; for (inti=0; i < tempLength; i++) { resultMatrix[tempIndex][i] = i; } // Of for i
// Merge inttempCurrentLength=1; // The indices for current merged groups. int tempFirstStart, tempSecondStart, tempSecondEnd;
while (tempCurrentLength < tempLength) { // Divide into a number of groups. // Here the boundary is adaptive to array length not equal to 2^k. for (inti=0; i < Math.ceil((tempLength + 0.0) / tempCurrentLength / 2); i++) { // Boundaries of the group tempFirstStart = i * tempCurrentLength * 2;
if (paraArray[resultMatrix[tempIndex % 2][tempFirstIndex]] >= paraArray[resultMatrix[tempIndex % 2][tempSecondIndex]]) { resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempFirstIndex]; tempFirstIndex++; } else { resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempSecondIndex]; tempSecondIndex++; } // Of if tempCurrentIndex++; } // Of while
// Remaining part for (intj= tempFirstIndex; j < tempSecondStart; j++) { resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j]; tempCurrentIndex++; } // Of for j for (intj= tempSecondIndex; j <= tempSecondEnd; j++) { resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j]; tempCurrentIndex++; } // Of for j } // Of for i
tempCurrentLength *= 2; tempIndex++; } // Of while
return resultMatrix[tempIndex % 2]; }// Of mergeSortToIndices
/** * ******************** * The Euclidean distance between two instances. Other distance measures * unsupported for simplicity. * * @param paraI The index of the first instance. * @param paraJ The index of the second instance. * @return The distance. * ******************** */ publicdoubledistance(int paraI, int paraJ) { doubleresultDistance=0; double tempDifference; for (inti=0; i < dataset.numAttributes() - 1; i++) { tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i); resultDistance += tempDifference * tempDifference; } // Of for i resultDistance = Math.sqrt(resultDistance);
return resultDistance; }// Of distance
/** * ********************************* * Compute the maximal distance. The result is stored in a member variable. * ********************************* */ publicvoidcomputeMaximalDistance() { maximalDistance = 0; double tempDistance; for (inti=0; i < dataset.numInstances(); i++) { for (intj=0; j < dataset.numInstances(); j++) { tempDistance = distance(i, j); if (maximalDistance < tempDistance) { maximalDistance = tempDistance; } // Of if } // Of for j } // Of for i
System.out.println("maximalDistance = " + maximalDistance); }// Of computeMaximalDistance
for (inti=0; i < dataset.numInstances(); i++) { for (intj=0; j < dataset.numInstances(); j++) { tempDistance = distance(i, j); densities[i] += Math.exp(-tempDistance * tempDistance / radius / radius); } // Of for j } // Of for i
System.out.println("The densities are " + Arrays.toString(densities) + "\r\n"); }// Of computeDensitiesGaussian
/** * ********************************* * Compute distanceToMaster, the distance to its master. * ********************************* */ publicvoidcomputeDistanceToMaster() { distanceToMaster = newdouble[dataset.numInstances()]; masters = newint[dataset.numInstances()]; descendantDensities = newint[dataset.numInstances()]; instanceStatusArray = newint[dataset.numInstances()];
double tempDistance; for (inti=1; i < dataset.numInstances(); i++) { // Initialize. distanceToMaster[descendantDensities[i]] = maximalDistance; for (intj=0; j <= i - 1; j++) { tempDistance = distance(descendantDensities[i], descendantDensities[j]); if (distanceToMaster[descendantDensities[i]] > tempDistance) { distanceToMaster[descendantDensities[i]] = tempDistance; masters[descendantDensities[i]] = descendantDensities[j]; } // Of if } // Of for j } // Of for i System.out.println("First compute, masters = " + Arrays.toString(masters)); System.out.println("descendantDensities = " + Arrays.toString(descendantDensities)); }// Of computeDistanceToMaster
/** * ********************************* * Compute priority. Element with higher priority is more likely to be * selected as a cluster center. Now it is rho * distanceToMaster. It can * also be rho^alpha * distanceToMaster. * ********************************* */ publicvoidcomputePriority() { priority = newdouble[dataset.numInstances()]; for (inti=0; i < dataset.numInstances(); i++) { priority[i] = densities[i] * distanceToMaster[i]; } // Of for i }// Of computePriority
/** * ************************ * The block of a node should be same as its master. This recursive method * is efficient. * * @param paraIndex The index of the given node. * @return The cluster index of the current node. * ************************ */ publicintcoincideWithMaster(int paraIndex) { if (clusterIndices[paraIndex] == -1) { inttempMaster= masters[paraIndex]; clusterIndices[paraIndex] = coincideWithMaster(tempMaster); } // Of if
return clusterIndices[paraIndex]; }// Of coincideWithMaster
/** * ************************ * Cluster a block in two. According to the master tree. * * @param paraBlock The given block. * @return The new blocks where the two most represent instances serve as the root. * ************************ */ publicint[][] clusterInTwo(int[] paraBlock) { // Reinitialize. In fact, only instances in the given block is // considered. Arrays.fill(clusterIndices, -1);
// Initialize the cluster number of the two roots. for (inti=0; i < 2; i++) { clusterIndices[paraBlock[i]] = i; } // Of for i
for (int j : paraBlock) { if (clusterIndices[j] != -1) { // Already have a cluster number. continue; } // Of if
clusterIndices[j] = coincideWithMaster(masters[j]); } // Of for i
// The sub blocks. int[][] resultBlocks = newint[2][]; inttempFistBlockCount=0; for (int clusterIndex : clusterIndices) { if (clusterIndex == 0) { tempFistBlockCount++; } // Of if } // Of for i resultBlocks[0] = newint[tempFistBlockCount]; resultBlocks[1] = newint[paraBlock.length - tempFistBlockCount];
// Copy. You can design shorter code when the number of clusters is // greater than 2. inttempFirstIndex=0; inttempSecondIndex=0; for (int j : paraBlock) { if (clusterIndices[j] == 0) { resultBlocks[0][tempFirstIndex] = j; tempFirstIndex++; } else { resultBlocks[1][tempSecondIndex] = j; tempSecondIndex++; } // Of if } // Of for i
/** * ********************************* * Classify instances in the block by simple voting. * * @param paraBlock The given block. * ********************************* */ publicvoidvote(int[] paraBlock) { int[] tempClassCounts = newint[dataset.numClasses()]; for (int j : paraBlock) { if (instanceStatusArray[j] == 1) { tempClassCounts[(int) dataset.instance(j).classValue()]++; } // Of if } // Of for i
inttempMaxClass= -1; inttempMaxCount= -1; for (inti=0; i < tempClassCounts.length; i++) { if (tempMaxCount < tempClassCounts[i]) { tempMaxClass = i; tempMaxCount = tempClassCounts[i]; } // Of if } // Of for i
// Classify unprocessed instances. for (int j : paraBlock) { if (instanceStatusArray[j] == 0) { predictedLabels[j] = tempMaxClass; instanceStatusArray[j] = 2; } // Of if } // Of for i }// Of vote
/** * ********************************* * Cluster based active learning. Prepare for * * @param paraRatio The ratio of the maximal distance as the dc. * @param paraMaxNumQuery The maximal number of queries for the whole dataset. * @param paraSmallBlockThreshold The small block threshold. * ********************************* */ publicvoidclusterBasedActiveLearning(double paraRatio, int paraMaxNumQuery, int paraSmallBlockThreshold) { radius = maximalDistance * paraRatio; smallBlockThreshold = paraSmallBlockThreshold;
numQuery = 0; clusterBasedActiveLearning(descendantRepresentatives); }// Of clusterBasedActiveLearning
/** * ********************************* * Cluster based active learning. * * @param paraBlock The given block. This block must be sorted according to the priority in descendant order. * ********************************* */ publicvoidclusterBasedActiveLearning(int[] paraBlock) { System.out.println("clusterBasedActiveLearning for block " + Arrays.toString(paraBlock));
// Step 1. How many labels are queried for this block. inttempExpectedQueries= (int) Math.sqrt(paraBlock.length); inttempNumQuery=0; for (int j : paraBlock) { if (instanceStatusArray[j] == 1) { tempNumQuery++; } // Of if } // Of for i
// Step 2. Vote for small blocks. if ((tempNumQuery >= tempExpectedQueries) && (paraBlock.length <= smallBlockThreshold)) { System.out.println("" + tempNumQuery + " instances are queried, vote for block: \r\n" + Arrays.toString(paraBlock)); vote(paraBlock);
return; } // Of if
// Step 3. Query enough labels. for (inti=0; i < tempExpectedQueries; i++) { if (numQuery >= maxNumQuery) { System.out.println("No more queries are provided, numQuery = " + numQuery + "."); vote(paraBlock); return; } // Of if
if (instanceStatusArray[paraBlock[i]] == 0) { instanceStatusArray[paraBlock[i]] = 1; predictedLabels[paraBlock[i]] = (int) dataset.instance(paraBlock[i]).classValue(); numQuery++; } // Of if } // Of for i
// Step 4. Pure? inttempFirstLabel= predictedLabels[paraBlock[0]]; booleantempPure=true; for (inti=1; i < tempExpectedQueries; i++) { if (predictedLabels[paraBlock[i]] != tempFirstLabel) { tempPure = false; break; } // Of if } // Of for i if (tempPure) { System.out.println("Classify for pure block: " + Arrays.toString(paraBlock)); for (inti= tempExpectedQueries; i < paraBlock.length; i++) { if (instanceStatusArray[paraBlock[i]] == 0) { predictedLabels[paraBlock[i]] = tempFirstLabel; instanceStatusArray[paraBlock[i]] = 2; } // Of if } // Of for i return; } // Of if
// Step 5. Split in two and process them independently. int[][] tempBlocks = clusterInTwo(paraBlock); for (inti=0; i < 2; i++) { // Attention: recursive invoking here. clusterBasedActiveLearning(tempBlocks[i]); } // Of for i }// Of clusterBasedActiveLearning
/** * ****************** * Show the statistics information. * ****************** */ public String toString() { int[] tempStatusCounts = newint[3]; doubletempCorrect=0; for (inti=0; i < dataset.numInstances(); i++) { tempStatusCounts[instanceStatusArray[i]]++; if (predictedLabels[i] == (int) dataset.instance(i).classValue()) { tempCorrect++; } // Of if } // Of for i
/** * ********************************* * The entrance of the program. * * @param args: Not used now. * ********************************* */ publicstaticvoidmain(String[] args) { longtempStart= System.currentTimeMillis();