import com.aliasi.util.Files; import com.aliasi.classify.BinaryLMClassifier; import com.aliasi.classify.Classification; import com.aliasi.classify.Classifier; import com.aliasi.classify.ClassifierEvaluator; import com.aliasi.classify.DynamicLMClassifier; import com.aliasi.classify.NaiveBayesClassifier; import com.aliasi.lm.NGramProcessLM; import com.aliasi.tokenizer.RegExTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import com.aliasi.util.AbstractExternalizable; import com.aliasi.util.ObjectToCounterMap; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; public class PolarityWhole { File mPolarityDir; List[] mPosReviewLists; List[] mNegReviewLists; ObjectToCounterMap mTokenCount = new ObjectToCounterMap(); public static void main(String[] args) { try { new PolarityWhole(args).run(); } catch (Throwable t) { System.out.println("Thrown: " + t); t.printStackTrace(System.out); } } PolarityWhole(String[] args) { mPolarityDir = new File(args[0]); } void run() throws ClassNotFoundException, IOException { System.out.println("POLARITY DEMO"); System.out.println(" Data Directory=" + mPolarityDir); readData(); evaluate(new NGramFactory(8)); evaluate(new NaiveBayesFactory(5)); } void evaluate(DynamicLMClassifierFactory factory) throws ClassNotFoundException, IOException { ClassifierEvaluator evaluator = new ClassifierEvaluator(null,CATEGORIES); for (int i = 0; i < NUM_FOLDS; ++i) evaluate(i,factory,evaluator); } void evaluate(int fold, DynamicLMClassifierFactory factory, ClassifierEvaluator evaluator) throws ClassNotFoundException, IOException { System.out.println(" Evaluating fold=" + fold); DynamicLMClassifier classifier = factory.create(); System.out.print(" Training. Fold="); for (int i = 0; i < NUM_FOLDS; ++i) { if (i != fold) { System.out.print(" " + i); train(i,classifier); } } System.out.println(); System.out.println(" Compiling."); factory.tweak(classifier); Classifier compiledClassifier = (Classifier) AbstractExternalizable.compile(classifier); System.out.println(" Testing."); test(fold,compiledClassifier,evaluator); System.out.println("EVALUATION"); System.out.println("CLASSIFIER=" + factory.toString()); System.out.println(evaluator.toString()); } void train(int fold, DynamicLMClassifier classifier) { train(mPosReviewLists[fold],POSITIVE,classifier); train(mNegReviewLists[fold],NEGATIVE,classifier); } void train(List reviewList, String category, DynamicLMClassifier classifier) { Iterator it = reviewList.iterator(); while (it.hasNext()) { String text = it.next().toString(); classifier.train(category,text); } } void test(int fold, Classifier classifier, ClassifierEvaluator evaluator) { test(mPosReviewLists[fold],POSITIVE,classifier,evaluator); test(mNegReviewLists[fold],NEGATIVE,classifier,evaluator); } void test(List reviewList, String category, Classifier classifier, ClassifierEvaluator evaluator) { Iterator it = reviewList.iterator(); while (it.hasNext()) { String review = it.next().toString(); Classification classification = classifier.classify(review); evaluator.addClassification(category,classification); } } void readData() throws IOException { mPosReviewLists = readData(new File(mPolarityDir,"pos")); mNegReviewLists = readData(new File(mPolarityDir,"neg")); Object[] keys = mTokenCount.keysOrderedByCount(); System.out.println(" #Tokens=" + keys.length); for (int i = 0; i < keys.length && i < 200; ++i) System.out.println(" " + keys[i] + "=" + mTokenCount.getCount(keys[i])); } List[] readData(File dir) throws IOException { List[] foldReviewLists = new ArrayList[NUM_FOLDS]; for (int i = 0; i < NUM_FOLDS; ++i) foldReviewLists[i] = new ArrayList(); File[] files = dir.listFiles(); for (int i = 0; i < files.length; ++i) { String review = Files.readFromFile(files[i]); addTokens(review); int k = fileToFold(files[i]); foldReviewLists[k].add(review); } return foldReviewLists; } void addTokens(String review) { String[] tokens = review.split("\\s+"); for (int i = 0; i < tokens.length; ++i) { mTokenCount.increment(tokens[i]); } } int fileToFold(File file) { String name = file.getName(); char foldChar = name.charAt(2); if (foldChar == '0') return 0; else return (foldChar - '1') + 1; } static final int NUM_FOLDS = 10; static final String POSITIVE = BinaryLMClassifier.DEFAULT_ACCEPT_CATEGORY; static final String NEGATIVE = BinaryLMClassifier.DEFAULT_REJECT_CATEGORY; static final String[] CATEGORIES = new String[] { BinaryLMClassifier.DEFAULT_ACCEPT_CATEGORY, BinaryLMClassifier.DEFAULT_REJECT_CATEGORY }; static abstract class DynamicLMClassifierFactory { abstract DynamicLMClassifier create(); void tweak(DynamicLMClassifier classifier) { } } class NGramFactory extends DynamicLMClassifierFactory { int mMaxNGram; public NGramFactory(int maxNGram) { mMaxNGram = maxNGram; } public DynamicLMClassifier create() { return DynamicLMClassifier .createNGramProcess(CATEGORIES,mMaxNGram); } public void tweak(DynamicLMClassifier classifier) { NGramProcessLM lmPos = (NGramProcessLM) classifier.lmForCategory(POSITIVE); NGramProcessLM lmNeg = (NGramProcessLM) classifier.lmForCategory(NEGATIVE); Object[] keys = mTokenCount.keysOrderedByCount(); for (int i = 0; i < keys.length; ++i) { String token = keys[i].toString(); lmPos.train(token); lmNeg.train(token); } // lmPos.substringCounter().prune(2); // lmNeg.substringCounter().prune(2); } public String toString() { return mMaxNGram + "-gram Character LM Classifier"; } } class NaiveBayesFactory extends DynamicLMClassifierFactory { int mMaxNGram; public NaiveBayesFactory(int nGram) { mMaxNGram = nGram; } public DynamicLMClassifier create() { return new NaiveBayesClassifier(CATEGORIES, new SpaceTokenizerFactory(), mMaxNGram); } public String toString() { return "Naive Bayes with " + mMaxNGram + "-gram Char Smoothing"; } } public static class SpaceTokenizerFactory extends RegExTokenizerFactory { public SpaceTokenizerFactory() { super("\\S+"); } } }