Loading src/main/java/edu/bu/LanguageCorrection/Checker.java +2 −1 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ public class Checker { Map<String, Float> phraseScores = new HashMap<>(); for (String sentence : sentences) { sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", ""); //System.out.println("Analyzing sentence: " + sentence); List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3); Loading @@ -27,7 +28,7 @@ public class Checker { for (String phrase : phrases) { // System.out.println("Analyzing phrase: " + phrase); float perplexity = detector.perplexity(phrase); if (perplexity > 100) { if (perplexity < 0) { phraseScores.put(phrase, 100f); } else { phraseScores.put(phrase, perplexity); Loading src/main/java/edu/bu/LanguageCorrection/Corrector.java +6 −2 Original line number Diff line number Diff line Loading @@ -90,8 +90,12 @@ public class Corrector { try { String content = new String(Files.readAllBytes(Paths.get(path))); Corrector corrector = new Corrector(); // Run corrector String corrected = corrector.correct(content); String[] sentences = TextProcessor.extractSentences(content).toArray(new String[0]); for (String sentence : sentences) { sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", ""); String corrected = corrector.correct(sentence); System.out.println(corrected); } } catch (IOException e) { System.err.println("Error reading file: " + e.getMessage()); } Loading src/main/java/edu/bu/LanguageCorrection/TrieNode.java +18 −42 Original line number Diff line number Diff line Loading @@ -22,66 +22,42 @@ public class TrieNode implements Serializable, Cloneable { } public float probability(String phrase) { String[] words = phrase.split(" "); if (words.length <= 1) { // If word does not exist in trie // System.out.println("Probability of phrase: " + 1 / this.childCounts); return (float) 0.1; } TrieNode current = this; TrieNode past = this; for (String word : phrase.split(" ")) { System.out.println("Phrase: " + phrase); for (String word : words) { past = current; current = current.children.get(word); if (current == null) { float alpha = (float) 1; // System.out.println("Phrase not found in trie."); return 0; return alpha * probability(phrase.substring(phrase.indexOf(" ") + 1)); } } // System.out.println("Probability of phrase: " + (double) current.count / // past.childCounts); // System.out.println("Probability of phrase: " + (float) current.count / past.childCounts); return (float) current.count / past.childCounts; } private float getAverageChildCount() { if (this.children.size() == 0) { return 1; } return (float) this.childCounts / this.children.size(); private float getAverageChildCount(TrieNode node) { return (float) node.childCounts / node.children.size(); } public float perplexity(String phrase) { TrieNode current = this; TrieNode past = this; float logProb = 0; String[] words = phrase.split(" "); if (words.length == 1) { return (float) 100 / words.length; } String currentPhrase = ""; for (String word : words) { past = current; current = current.children.get(word); if (current == null) { float alpha = (float) 100 / words.length; return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), words.length); } logProb += Math.log((float) current.count / past.getAverageChildCount()); } float perplexity = (float) Math.pow(2, -logProb); //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity); return perplexity; } private float perplexity(String phrase, int wordCount) { TrieNode current = this; TrieNode past = this; float logProb = 0; String[] words = phrase.split(" "); if (words.length == 1) { return (float) 100 / wordCount; } for (String word : words) { past = current; current = current.children.get(word); if (current == null) { float alpha = (float) 100 / wordCount; return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), wordCount); if (currentPhrase.length() == 0) { currentPhrase = word; } else { currentPhrase += " " + word; } logProb += Math.log((float) current.count / past.getAverageChildCount()); logProb += Math.log((probability(currentPhrase))); } float perplexity = (float) Math.pow(2, -logProb); //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity); Loading Loading
src/main/java/edu/bu/LanguageCorrection/Checker.java +2 −1 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ public class Checker { Map<String, Float> phraseScores = new HashMap<>(); for (String sentence : sentences) { sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", ""); //System.out.println("Analyzing sentence: " + sentence); List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3); Loading @@ -27,7 +28,7 @@ public class Checker { for (String phrase : phrases) { // System.out.println("Analyzing phrase: " + phrase); float perplexity = detector.perplexity(phrase); if (perplexity > 100) { if (perplexity < 0) { phraseScores.put(phrase, 100f); } else { phraseScores.put(phrase, perplexity); Loading
src/main/java/edu/bu/LanguageCorrection/Corrector.java +6 −2 Original line number Diff line number Diff line Loading @@ -90,8 +90,12 @@ public class Corrector { try { String content = new String(Files.readAllBytes(Paths.get(path))); Corrector corrector = new Corrector(); // Run corrector String corrected = corrector.correct(content); String[] sentences = TextProcessor.extractSentences(content).toArray(new String[0]); for (String sentence : sentences) { sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", ""); String corrected = corrector.correct(sentence); System.out.println(corrected); } } catch (IOException e) { System.err.println("Error reading file: " + e.getMessage()); } Loading
src/main/java/edu/bu/LanguageCorrection/TrieNode.java +18 −42 Original line number Diff line number Diff line Loading @@ -22,66 +22,42 @@ public class TrieNode implements Serializable, Cloneable { } public float probability(String phrase) { String[] words = phrase.split(" "); if (words.length <= 1) { // If word does not exist in trie // System.out.println("Probability of phrase: " + 1 / this.childCounts); return (float) 0.1; } TrieNode current = this; TrieNode past = this; for (String word : phrase.split(" ")) { System.out.println("Phrase: " + phrase); for (String word : words) { past = current; current = current.children.get(word); if (current == null) { float alpha = (float) 1; // System.out.println("Phrase not found in trie."); return 0; return alpha * probability(phrase.substring(phrase.indexOf(" ") + 1)); } } // System.out.println("Probability of phrase: " + (double) current.count / // past.childCounts); // System.out.println("Probability of phrase: " + (float) current.count / past.childCounts); return (float) current.count / past.childCounts; } private float getAverageChildCount() { if (this.children.size() == 0) { return 1; } return (float) this.childCounts / this.children.size(); private float getAverageChildCount(TrieNode node) { return (float) node.childCounts / node.children.size(); } public float perplexity(String phrase) { TrieNode current = this; TrieNode past = this; float logProb = 0; String[] words = phrase.split(" "); if (words.length == 1) { return (float) 100 / words.length; } String currentPhrase = ""; for (String word : words) { past = current; current = current.children.get(word); if (current == null) { float alpha = (float) 100 / words.length; return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), words.length); } logProb += Math.log((float) current.count / past.getAverageChildCount()); } float perplexity = (float) Math.pow(2, -logProb); //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity); return perplexity; } private float perplexity(String phrase, int wordCount) { TrieNode current = this; TrieNode past = this; float logProb = 0; String[] words = phrase.split(" "); if (words.length == 1) { return (float) 100 / wordCount; } for (String word : words) { past = current; current = current.children.get(word); if (current == null) { float alpha = (float) 100 / wordCount; return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), wordCount); if (currentPhrase.length() == 0) { currentPhrase = word; } else { currentPhrase += " " + word; } logProb += Math.log((float) current.count / past.getAverageChildCount()); logProb += Math.log((probability(currentPhrase))); } float perplexity = (float) Math.pow(2, -logProb); //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity); Loading