import com.aliasi.classify.PrecisionRecallEvaluation; import com.aliasi.cluster.HierarchicalClusterer; import com.aliasi.cluster.ClusterScore; import com.aliasi.cluster.CompleteLinkClusterer; import com.aliasi.cluster.SingleLinkClusterer; import com.aliasi.cluster.Dendrogram; import com.aliasi.util.Counter; import com.aliasi.util.Distance; import com.aliasi.util.Files; import com.aliasi.util.ObjectToCounterMap; import com.aliasi.util.Strings; import com.aliasi.tokenizer.EnglishStopListFilterTokenizer; import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.LowerCaseFilterTokenizer; import com.aliasi.tokenizer.PorterStemmerFilterTokenizer; import com.aliasi.tokenizer.Tokenizer; import java.io.*; import java.util.*; public class TokenCosineDocCluster { public static void main(String[] args) throws Exception { File dir = new File(args[0]); Set> referencePartition = new HashSet>(); for (File catDir : dir.listFiles()) { System.out.println("Category from file=" + catDir); Set docsForCat = new HashSet(); referencePartition.add(docsForCat); for (File file : catDir.listFiles()) { Document doc = new Document(file); docsForCat.add(doc); } } Set docSet = new HashSet(); for (Set cluster : referencePartition) docSet.addAll(cluster); // eval clusterers HierarchicalClusterer clClusterer = new CompleteLinkClusterer(COSINE_DISTANCE); Dendrogram completeLinkDendrogram = clClusterer.hierarchicalCluster(docSet); HierarchicalClusterer slClusterer = new SingleLinkClusterer(COSINE_DISTANCE); Dendrogram singleLinkDendrogram = slClusterer.hierarchicalCluster(docSet); System.out.println(); System.out.println(" --------------------------------------------------------"); System.out.println("| K | Complete | Single | Cross |"); System.out.println("| | P R F | P R F | P R F |"); System.out.println(" --------------------------------------------------------"); for (int k = 1; k <= docSet.size(); ++k) { Set> clResponsePartition = completeLinkDendrogram.partitionK(k); Set> slResponsePartition = singleLinkDendrogram.partitionK(k); ClusterScore scoreCL = new ClusterScore(referencePartition, clResponsePartition); PrecisionRecallEvaluation clPrEval = scoreCL.equivalenceEvaluation(); ClusterScore scoreSL = new ClusterScore(referencePartition, slResponsePartition); PrecisionRecallEvaluation slPrEval = scoreSL.equivalenceEvaluation(); ClusterScore scoreX = new ClusterScore(clResponsePartition, slResponsePartition); PrecisionRecallEvaluation xPrEval = scoreX.equivalenceEvaluation(); System.out.printf("| %3d | %3.2f %3.2f %3.2f | %3.2f %3.2f %3.2f | %3.2f %3.2f %3.2f |\n", k, clPrEval.precision(), clPrEval.recall(), clPrEval.fMeasure(), slPrEval.precision(), slPrEval.recall(), slPrEval.fMeasure(), xPrEval.precision(), xPrEval.recall(), xPrEval.fMeasure() ); } System.out.println(" --------------------------------------------------------"); } static class Document { final File mFile; final char[] mText; // don't really need to store final ObjectToCounterMap mTokenCounter = new ObjectToCounterMap(); final double mLength; Document(File file) throws IOException { mFile = file; // includes name mText = Files.readCharsFromFile(file,Strings.UTF8); Tokenizer tokenizer = createTokenizer(mText); String token; while ((token = tokenizer.nextToken()) != null) mTokenCounter.increment(token.toLowerCase()); mLength = length(mTokenCounter); } double cosine(Document thatDoc) { return product(thatDoc) / (mLength * thatDoc.mLength); } double product(Document thatDoc) { double sum = 0.0; for (String token : mTokenCounter.keySet()) { int count = thatDoc.mTokenCounter.getCount(token); if (count == 0) continue; // tf = sqrt(count); sum += tf1 * tf2 sum += Math.sqrt(count * mTokenCounter.getCount(token)); } return sum; } public String toString() { return mFile.getParentFile().getName() + "/" + mFile.getName(); } static double length(ObjectToCounterMap otc) { double sum = 0.0; for (Counter counter : otc.values()) { double count = counter.doubleValue(); sum += count; // tf =sqrt(count); sum += tf * tf } return Math.sqrt(sum); } static Tokenizer createTokenizer(char[] cs) { Tokenizer tokenizer = IndoEuropeanTokenizerFactory .FACTORY.tokenizer(cs,0,cs.length); // tokenizer = new LowerCaseFilterTokenizer(tokenizer); // tokenizer = new EnglishStopListFilterTokenizer(tokenizer); // tokenizer = new PorterStemmerFilterTokenizer(tokenizer); return tokenizer; } } static final Distance COSINE_DISTANCE = new Distance() { public double distance(Document doc1, Document doc2) { double oneMinusCosine = 1.0 - doc1.cosine(doc2); if (oneMinusCosine > 1.0) return 1.0; else if (oneMinusCosine < 0.0) return 0.0; else return oneMinusCosine; } }; }