Commit 66ce634e authored by Manuel Segimon's avatar Manuel Segimon
Browse files

implemented checker

parent 5c210d2f
Loading
Loading
Loading
Loading

checker_test_file.txt

0 → 100644
+2 −0
Original line number Diff line number Diff line
Sorry for the terrible inconvenience but this site is still under super development. be right back.
Sorry for Apple Dog Hello World the terrible so terrible inconvenience under super development.
 No newline at end of file
+0 −66
Original line number Diff line number Diff line
package edu.bu.LanguageCorrection;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

public class AnomalyDetector {

    /** Small demonstration set of very common English words (stored lower-case). */
    private static final Set<String> commonWords = new HashSet<>();
    /** Rough average English word length; used to estimate an expected sentence length. */
    private static final int AVERAGE_WORD_LENGTH = 5; // TODO: Change later

    static { // simple list of common words for demonstration purposes
        commonWords.add("the");
        commonWords.add("be");
        commonWords.add("to");
        commonWords.add("of");
        commonWords.add("and"); // we can add more later
    }

    /**
     * Scores each sentence by how "anomalous" it looks: deviation from an
     * assumed average sentence length, plus 10 points per uncommon word.
     * Scores are capped at 100.
     *
     * @param sentences sentences to score
     * @return map from each sentence to its score in [0, 100]
     */
    public Map<String, Integer> analyzeSentences(List<String> sentences) {
        Map<String, Integer> sentenceScores = new HashMap<>();
        for (String sentence : sentences) {
            // Length variance: distance from an assumed ~50-char average sentence.
            int score = Math.abs(sentence.length() - AVERAGE_WORD_LENGTH * 10);
            // Word rarity: each word not in the common-word list adds 10.
            score += 10 * countUncommonWords(sentence);
            sentenceScores.put(sentence, Math.min(score, 100)); // normalize to [0, 100]
        }
        return sentenceScores;
    }

    /**
     * Scores every 2-3 word phrase of each sentence: +20 when the phrase
     * length falls outside a plausible range, +5 per uncommon word.
     * Scores are capped at 100.
     *
     * @param sentences sentences whose extracted phrases are scored
     * @return map from each phrase to its score in [0, 100]
     */
    public Map<String, Integer> analyzePhrases(List<String> sentences) {
        Map<String, Integer> phraseScores = new HashMap<>();
        for (String sentence : sentences) {
            for (String phrase : TextProcessor.extractPhrases(sentence, 2, 3)) {
                int score = 0;
                // Length variance: very short or very long phrases look anomalous.
                if (phrase.length() < AVERAGE_WORD_LENGTH || phrase.length() > AVERAGE_WORD_LENGTH * 3) {
                    score += 20;
                }
                score += 5 * countUncommonWords(phrase);
                phraseScores.put(phrase, Math.min(score, 100)); // normalize to [0, 100]
            }
        }
        return phraseScores;
    }

    /**
     * Counts words of {@code text} that are not in {@link #commonWords}.
     *
     * <p>Fixes over the previous inline loops: tokens are stripped of
     * punctuation before lookup (so "the," matches "the"), empty tokens
     * produced by leading or repeated whitespace are ignored instead of
     * being scored as uncommon, and lower-casing uses {@link Locale#ROOT}
     * so matching is locale-independent.
     */
    private static int countUncommonWords(String text) {
        int count = 0;
        for (String token : text.split("\\s+")) {
            String word = token.replaceAll("[^\\p{L}\\p{Nd}]", "").toLowerCase(Locale.ROOT);
            if (!word.isEmpty() && !commonWords.contains(word)) {
                count++;
            }
        }
        return count;
    }
}
+68 −7
Original line number Diff line number Diff line
@@ -4,17 +4,45 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.zip.Inflater;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;

import edu.bu.LanguageCorrection.AnomalyDetector;
import edu.bu.LanguageCorrection.TextProcessor;

public class Checker {
    public void analyze(String text) {
        List<String> sentences = TextProcessor.extractSentences(text);
        AnomalyDetector detector = new AnomalyDetector();

        Map<String, Integer> sentenceScores = detector.analyzeSentences(sentences);
        Map<String, Integer> phraseScores = detector.analyzePhrases(sentences);
        TrieNode detector = loadFile("metadata.ser");

        Map<String, Float> sentenceScores = new HashMap<>();
        Map<String, Float> phraseScores = new HashMap<>();

        for (String sentence : sentences) {
            //System.out.println("Analyzing sentence: " + sentence);
            List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3);

            // Calculate perplexity (score) for each phrase
            for (String phrase : phrases) {
                // System.out.println("Analyzing phrase: " + phrase);
                float perplexity = detector.perplexity(phrase);
                if (perplexity > 100) {
                    phraseScores.put(phrase, 100f);
                } else {
                    phraseScores.put(phrase, perplexity);
                }
            }

            // Calculate average perplexity for the sentence
            float sentenceScore = 0;
            for (String phrase : phrases) {
                sentenceScore += phraseScores.get(phrase);
            }
            sentenceScore /= phrases.size();
            sentenceScores.put(sentence, sentenceScore);
        }

        // Output results in JSON format
        System.out.println("{");
@@ -22,9 +50,9 @@ public class Checker {
        System.out.println("\"phrases\": " + mapToJson(phraseScores));
        System.out.println("}");
    }
    private static String mapToJson(Map<String, Integer> map) {
    private static String mapToJson(Map<String, Float> map) {
        StringBuilder jsonBuilder = new StringBuilder("{");
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
        for (Map.Entry<String, Float> entry : map.entrySet()) {
            jsonBuilder.append("\"" + entry.getKey() + "\": " + entry.getValue() + ",");
        }
        jsonBuilder.deleteCharAt(jsonBuilder.length() - 1); // remove last comma
@@ -32,6 +60,39 @@ public class Checker {

        return jsonBuilder.toString();
    }
    /**
     * Inflates zlib/DEFLATE-compressed bytes.
     *
     * <p>Best-effort: logs to stderr and returns an empty array on corrupt
     * data (matching the previous behavior) rather than throwing.
     *
     * <p>Fixes: {@code decompressor.end()} now runs in a finally block —
     * previously it was skipped on the error path, leaking the Inflater's
     * native zlib memory; a guard breaks out when the input is truncated,
     * where {@code inflate} returns 0 forever but {@code finished()} stays
     * false, which previously spun in an infinite loop.
     *
     * @param compressedData zlib-compressed bytes
     * @return decompressed bytes, or an empty array on error
     */
    private static byte[] decompress(byte[] compressedData) {
        Inflater decompressor = new Inflater();
        decompressor.setInput(compressedData);

        ByteArrayOutputStream bos = new ByteArrayOutputStream(compressedData.length);

        byte[] buf = new byte[1024];
        try {
            while (!decompressor.finished()) {
                int count = decompressor.inflate(buf);
                if (count == 0 && decompressor.needsInput()) {
                    break; // truncated input: no more bytes will ever arrive
                }
                bos.write(buf, 0, count);
            }
            return bos.toByteArray();
        } catch (Exception e) {
            System.err.println("Error decompressing data: " + e.getMessage());
            return new byte[0];
        } finally {
            decompressor.end(); // always release native zlib resources
        }
    }

    /**
     * Loads the serialized language-model trie from {@code filePath}
     * (zlib-compressed trie data).
     *
     * <p>Best-effort: returns an empty {@code TrieNode} when the file
     * cannot be read, after logging to stderr.
     *
     * <p>Fix: the "loaded successfully" notice now goes to stderr — this
     * program emits its JSON results on stdout, and the old
     * {@code System.out.println} corrupted that stream for any consumer
     * parsing the output.
     *
     * @param filePath path to the compressed metadata file
     * @return deserialized trie, or an empty trie on I/O failure
     */
    private TrieNode loadFile(String filePath) {
        try (FileInputStream fis = new FileInputStream(filePath)) {
            byte[] compressedData = fis.readAllBytes();
            byte[] decompressedData = decompress(compressedData);
            TrieNode trie = new TrieNode();
            trie.deserialize(decompressedData);
            System.err.println("Metadata loaded successfully.");
            return trie;
        } catch (IOException e) {
            System.err.println("Error reading metadata from file: " + e.getMessage());
            return new TrieNode();
        }
    }
    public static void main(String[] args) {
        if (args.length > 1 && "--file".equals(args[0])) { // check syntax
            String path = args[1];
+4 −1
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@ import java.util.Set;

public class TextProcessor {
    /**
     * Splits text into sentences on terminal punctuation (., !, ?) followed
     * by a space or newline. The matched delimiter is consumed, so interior
     * sentences lose their terminator; the final sentence keeps its
     * punctuation (there is no trailing separator to split on).
     *
     * <p>Fix: removes diff residue (the superseded plain-"\\." split line
     * left alongside the new one). {@code [.!?][ \n]} is the simplified
     * equivalent of the alternation {@code ([.!?] )|([.!?]\n)}.
     *
     * @param text raw input text
     * @return mutable list of sentences, in order
     */
    public static List<String> extractSentences(String text) {
        List<String> sentences = List.of(text.split("[.!?][ \n]"));
        return new ArrayList<>(sentences);
    }

@@ -14,6 +14,9 @@ public class TextProcessor {
    public static List<String> extractPhrases(String sentence, int minN, int maxN) {
        // Using a Set to avoid duplicate phrases
        Set<String> phraseSet = new HashSet<>();
        // Remove punctuation
        sentence = sentence.replaceAll("[^a-zA-Z0-9 ]", "");    
        // Split the sentence into words
        String[] words = sentence.split("\\s+");

        // Loop over the range of n values