Loading checker_test_file.txt 0 → 100644 +2 −0 Original line number Diff line number Diff line Sorry for the terrible inconvenience but this site is still under super development. be right back. Sorry for Apple Dog Hello World the terrible so terrible inconvenience under super development. No newline at end of file src/main/java/edu/bu/LanguageCorrection/AnomalyDetector.javadeleted 100644 → 0 +0 −66 Original line number Diff line number Diff line package edu.bu.LanguageCorrection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; public class AnomalyDetector { private static final Set<String> commonWords = new HashSet<>(); private static final int AVERAGE_WORD_LENGTH = 5; // TODO: Change later static { // simple list of common words for demonstration purposes commonWords.add("the"); commonWords.add("be"); commonWords.add("to"); commonWords.add("of"); commonWords.add("and"); // we can add more later } public Map<String, Integer> analyzeSentences(List<String> sentences) { Map<String, Integer> sentenceScores = new HashMap<>(); for (String sentence : sentences) { int score = 0; // length variance score += Math.abs(sentence.length() - AVERAGE_WORD_LENGTH * 10); // Assuming an average sentence length // word rarity String[] words = sentence.split("\\s+"); for (String word : words) { if (!commonWords.contains(word.toLowerCase())) { score += 10; // Increment score for each UNCOMMON word } } sentenceScores.put(sentence, Math.min(score, 100)); // Limit to 100 (normalize) } return sentenceScores; } public Map<String, Integer> analyzePhrases(List<String> sentences) { Map<String, Integer> phraseScores = new HashMap<>(); for (String sentence : sentences) { List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3); for (String phrase : phrases) { int score = 0; // Score length variance like above if (phrase.length() < AVERAGE_WORD_LENGTH || phrase.length() > AVERAGE_WORD_LENGTH * 3) { score += 20; } // 
Word rarity String[] words = phrase.split("\\s+"); for (String word : words) { if (!commonWords.contains(word.toLowerCase())) { score += 5; } } phraseScores.put(phrase, Math.min(score, 100)); // Normalize } } return phraseScores; } } src/main/java/edu/bu/LanguageCorrection/Checker.java +68 −7 Original line number Diff line number Diff line Loading @@ -4,17 +4,45 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.List; import java.util.Map; import java.util.HashMap; import java.util.ArrayList; import java.util.zip.Inflater; import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import edu.bu.LanguageCorrection.AnomalyDetector; import edu.bu.LanguageCorrection.TextProcessor; public class Checker { public void analyze(String text) { List<String> sentences = TextProcessor.extractSentences(text); AnomalyDetector detector = new AnomalyDetector(); Map<String, Integer> sentenceScores = detector.analyzeSentences(sentences); Map<String, Integer> phraseScores = detector.analyzePhrases(sentences); TrieNode detector = loadFile("metadata.ser"); Map<String, Float> sentenceScores = new HashMap<>(); Map<String, Float> phraseScores = new HashMap<>(); for (String sentence : sentences) { //System.out.println("Analyzing sentence: " + sentence); List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3); // Calculate perplexity (score) for each phrase for (String phrase : phrases) { // System.out.println("Analyzing phrase: " + phrase); float perplexity = detector.perplexity(phrase); if (perplexity > 100) { phraseScores.put(phrase, 100f); } else { phraseScores.put(phrase, perplexity); } } // Calculate average perplexity for the sentence float sentenceScore = 0; for (String phrase : phrases) { sentenceScore += phraseScores.get(phrase); } sentenceScore /= phrases.size(); sentenceScores.put(sentence, sentenceScore); } // Output results in JSON format System.out.println("{"); Loading @@ -22,9 +50,9 @@ public class Checker { 
System.out.println("\"phrases\": " + mapToJson(phraseScores)); System.out.println("}"); } private static String mapToJson(Map<String, Integer> map) { private static String mapToJson(Map<String, Float> map) { StringBuilder jsonBuilder = new StringBuilder("{"); for (Map.Entry<String, Integer> entry : map.entrySet()) { for (Map.Entry<String, Float> entry : map.entrySet()) { jsonBuilder.append("\"" + entry.getKey() + "\": " + entry.getValue() + ","); } jsonBuilder.deleteCharAt(jsonBuilder.length() - 1); // remove last comma Loading @@ -32,6 +60,39 @@ public class Checker { return jsonBuilder.toString(); } private static byte[] decompress(byte[] compressedData) { Inflater decompressor = new Inflater(); decompressor.setInput(compressedData); ByteArrayOutputStream bos = new ByteArrayOutputStream(compressedData.length); byte[] buf = new byte[1024]; try { while (!decompressor.finished()) { int count = decompressor.inflate(buf); bos.write(buf, 0, count); } decompressor.end(); return bos.toByteArray(); } catch (Exception e) { System.err.println("Error decompressing data: " + e.getMessage()); return new byte[0]; } } private TrieNode loadFile(String filePath) { TrieNode trie = new TrieNode(); try (FileInputStream fis = new FileInputStream(filePath)) { byte[] compressedData = fis.readAllBytes(); byte[] decompressedData = decompress(compressedData); trie.deserialize(decompressedData); System.out.println("Metadata loaded successfully."); return trie; } catch (IOException e) { System.err.println("Error reading metadata from file: " + e.getMessage()); return new TrieNode(); } } public static void main(String[] args) { if (args.length > 1 && "--file".equals(args[0])) { // check syntax String path = args[1]; Loading src/main/java/edu/bu/LanguageCorrection/TextProcessor.java +4 −1 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ import java.util.Set; public class TextProcessor { public static List<String> extractSentences(String text) { List<String> sentences = 
List.of(text.split("\\.")); List<String> sentences = List.of(text.split("([.!?] )|([.!?]\n)")); return new ArrayList<>(sentences); } Loading @@ -14,6 +14,9 @@ public class TextProcessor { public static List<String> extractPhrases(String sentence, int minN, int maxN) { // Using a Set to avoid duplicate phrases Set<String> phraseSet = new HashSet<>(); // Remove punctuation sentence = sentence.replaceAll("[^a-zA-Z0-9 ]", ""); // Split the sentence into words String[] words = sentence.split("\\s+"); // Loop over the range of n values Loading Loading
checker_test_file.txt 0 → 100644 +2 −0 Original line number Diff line number Diff line Sorry for the terrible inconvenience but this site is still under super development. be right back. Sorry for Apple Dog Hello World the terrible so terrible inconvenience under super development. No newline at end of file
// src/main/java/edu/bu/LanguageCorrection/AnomalyDetector.java
// (shown as deleted in this diff: 100644 -> 0, +0 -66)
package edu.bu.LanguageCorrection;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * Heuristic anomaly scorer for sentences and phrases.
 *
 * <p>A score combines a length component with a penalty for every word that is not in a small
 * built-in set of common words. Every score is capped at 100.
 */
public class AnomalyDetector {
    private static final Set<String> commonWords = new HashSet<>();
    private static final int AVERAGE_WORD_LENGTH = 5; // TODO: Change later

    static {
        // simple list of common words for demonstration purposes
        commonWords.add("the");
        commonWords.add("be");
        commonWords.add("to");
        commonWords.add("of");
        commonWords.add("and");
        // we can add more later
    }

    /**
     * Scores each sentence by its distance from an assumed average sentence length
     * ({@code AVERAGE_WORD_LENGTH * 10} characters) plus 10 points per uncommon word.
     *
     * @param sentences sentences to score
     * @return map from each sentence to its score, capped at 100
     */
    public Map<String, Integer> analyzeSentences(List<String> sentences) {
        Map<String, Integer> sentenceScores = new HashMap<>();
        for (String sentence : sentences) {
            // length variance, assuming an average sentence length
            int score = Math.abs(sentence.length() - AVERAGE_WORD_LENGTH * 10);
            // word rarity: 10 points for each UNCOMMON word
            score += 10 * countUncommonWords(sentence);
            sentenceScores.put(sentence, Math.min(score, 100)); // Limit to 100 (normalize)
        }
        return sentenceScores;
    }

    /**
     * Scores every 2- to 3-word phrase obtained from {@code TextProcessor.extractPhrases}:
     * 20 points when the phrase length falls outside
     * {@code [AVERAGE_WORD_LENGTH, AVERAGE_WORD_LENGTH * 3]} characters, plus 5 points per
     * uncommon word.
     *
     * @param sentences sentences whose phrases are scored
     * @return map from each phrase to its score, capped at 100
     */
    public Map<String, Integer> analyzePhrases(List<String> sentences) {
        Map<String, Integer> phraseScores = new HashMap<>();
        for (String sentence : sentences) {
            List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3);
            for (String phrase : phrases) {
                int score = 0;
                // Score length variance like above
                if (phrase.length() < AVERAGE_WORD_LENGTH
                        || phrase.length() > AVERAGE_WORD_LENGTH * 3) {
                    score += 20;
                }
                // Word rarity
                score += 5 * countUncommonWords(phrase);
                phraseScores.put(phrase, Math.min(score, 100)); // Normalize
            }
        }
        return phraseScores;
    }

    /** Counts whitespace-separated words of {@code text} that are not in the common-word set. */
    private static int countUncommonWords(String text) {
        int uncommon = 0;
        for (String word : text.split("\\s+")) {
            // split() yields an empty leading token when the text starts with whitespace
            // (or is empty); that token is not a word and must not be penalised.
            if (word.isEmpty()) {
                continue;
            }
            // Locale.ROOT keeps the lookup independent of the JVM's default locale.
            if (!commonWords.contains(word.toLowerCase(Locale.ROOT))) {
                uncommon++;
            }
        }
        return uncommon;
    }
}
src/main/java/edu/bu/LanguageCorrection/Checker.java +68 −7 Original line number Diff line number Diff line Loading @@ -4,17 +4,45 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.List; import java.util.Map; import java.util.HashMap; import java.util.ArrayList; import java.util.zip.Inflater; import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import edu.bu.LanguageCorrection.AnomalyDetector; import edu.bu.LanguageCorrection.TextProcessor; public class Checker { public void analyze(String text) { List<String> sentences = TextProcessor.extractSentences(text); AnomalyDetector detector = new AnomalyDetector(); Map<String, Integer> sentenceScores = detector.analyzeSentences(sentences); Map<String, Integer> phraseScores = detector.analyzePhrases(sentences); TrieNode detector = loadFile("metadata.ser"); Map<String, Float> sentenceScores = new HashMap<>(); Map<String, Float> phraseScores = new HashMap<>(); for (String sentence : sentences) { //System.out.println("Analyzing sentence: " + sentence); List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3); // Calculate perplexity (score) for each phrase for (String phrase : phrases) { // System.out.println("Analyzing phrase: " + phrase); float perplexity = detector.perplexity(phrase); if (perplexity > 100) { phraseScores.put(phrase, 100f); } else { phraseScores.put(phrase, perplexity); } } // Calculate average perplexity for the sentence float sentenceScore = 0; for (String phrase : phrases) { sentenceScore += phraseScores.get(phrase); } sentenceScore /= phrases.size(); sentenceScores.put(sentence, sentenceScore); } // Output results in JSON format System.out.println("{"); Loading @@ -22,9 +50,9 @@ public class Checker { System.out.println("\"phrases\": " + mapToJson(phraseScores)); System.out.println("}"); } private static String mapToJson(Map<String, Integer> map) { private static String mapToJson(Map<String, Float> map) { StringBuilder jsonBuilder = new 
StringBuilder("{"); for (Map.Entry<String, Integer> entry : map.entrySet()) { for (Map.Entry<String, Float> entry : map.entrySet()) { jsonBuilder.append("\"" + entry.getKey() + "\": " + entry.getValue() + ","); } jsonBuilder.deleteCharAt(jsonBuilder.length() - 1); // remove last comma Loading @@ -32,6 +60,39 @@ public class Checker { return jsonBuilder.toString(); } private static byte[] decompress(byte[] compressedData) { Inflater decompressor = new Inflater(); decompressor.setInput(compressedData); ByteArrayOutputStream bos = new ByteArrayOutputStream(compressedData.length); byte[] buf = new byte[1024]; try { while (!decompressor.finished()) { int count = decompressor.inflate(buf); bos.write(buf, 0, count); } decompressor.end(); return bos.toByteArray(); } catch (Exception e) { System.err.println("Error decompressing data: " + e.getMessage()); return new byte[0]; } } private TrieNode loadFile(String filePath) { TrieNode trie = new TrieNode(); try (FileInputStream fis = new FileInputStream(filePath)) { byte[] compressedData = fis.readAllBytes(); byte[] decompressedData = decompress(compressedData); trie.deserialize(decompressedData); System.out.println("Metadata loaded successfully."); return trie; } catch (IOException e) { System.err.println("Error reading metadata from file: " + e.getMessage()); return new TrieNode(); } } public static void main(String[] args) { if (args.length > 1 && "--file".equals(args[0])) { // check syntax String path = args[1]; Loading
src/main/java/edu/bu/LanguageCorrection/TextProcessor.java +4 −1 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ import java.util.Set; public class TextProcessor { public static List<String> extractSentences(String text) { List<String> sentences = List.of(text.split("\\.")); List<String> sentences = List.of(text.split("([.!?] )|([.!?]\n)")); return new ArrayList<>(sentences); } Loading @@ -14,6 +14,9 @@ public class TextProcessor { public static List<String> extractPhrases(String sentence, int minN, int maxN) { // Using a Set to avoid duplicate phrases Set<String> phraseSet = new HashSet<>(); // Remove punctuation sentence = sentence.replaceAll("[^a-zA-Z0-9 ]", ""); // Split the sentence into words String[] words = sentence.split("\\s+"); // Loop over the range of n values Loading