Commit 66ce634e authored by Manuel Segimon's avatar Manuel Segimon
Browse files

implemented checker

parent 5c210d2f
Loading
Loading
Loading
Loading

checker_test_file.txt

0 → 100644
+2 −0
Original line number Diff line number Diff line
Sorry for the terrible inconvenience but this site is still under super development. be right back.
Sorry for Apple Dog Hello World the terrible so terrible inconvenience under super development.
 No newline at end of file
+0 −66
Original line number Diff line number Diff line
package edu.bu.LanguageCorrection;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

public class AnomalyDetector {

    /** Small demonstration set of very common English words (stored lower-case). */
    private static final Set<String> commonWords = new HashSet<>();
    /** Rough average English word length; used to estimate an expected sentence length. */
    private static final int AVERAGE_WORD_LENGTH = 5; // TODO: Change later

    static { // simple list of common words for demonstration purposes
        commonWords.add("the");
        commonWords.add("be");
        commonWords.add("to");
        commonWords.add("of");
        commonWords.add("and"); // we can add more later
    }

    /**
     * Scores each sentence by how "anomalous" it looks: deviation from an
     * assumed average sentence length, plus 10 points per uncommon word.
     * Scores are capped at 100.
     *
     * @param sentences sentences to score
     * @return map from each sentence to its score in [0, 100]
     */
    public Map<String, Integer> analyzeSentences(List<String> sentences) {
        Map<String, Integer> sentenceScores = new HashMap<>();
        for (String sentence : sentences) {
            // Length variance: distance from an assumed ~50-char average sentence.
            int score = Math.abs(sentence.length() - AVERAGE_WORD_LENGTH * 10);
            // Word rarity: each word not in the common-word list adds 10.
            score += 10 * countUncommonWords(sentence);
            sentenceScores.put(sentence, Math.min(score, 100)); // normalize to [0, 100]
        }
        return sentenceScores;
    }

    /**
     * Scores every 2-3 word phrase of each sentence: +20 when the phrase
     * length falls outside a plausible range, +5 per uncommon word.
     * Scores are capped at 100.
     *
     * @param sentences sentences whose extracted phrases are scored
     * @return map from each phrase to its score in [0, 100]
     */
    public Map<String, Integer> analyzePhrases(List<String> sentences) {
        Map<String, Integer> phraseScores = new HashMap<>();
        for (String sentence : sentences) {
            for (String phrase : TextProcessor.extractPhrases(sentence, 2, 3)) {
                int score = 0;
                // Length variance: very short or very long phrases look anomalous.
                if (phrase.length() < AVERAGE_WORD_LENGTH || phrase.length() > AVERAGE_WORD_LENGTH * 3) {
                    score += 20;
                }
                score += 5 * countUncommonWords(phrase);
                phraseScores.put(phrase, Math.min(score, 100)); // normalize to [0, 100]
            }
        }
        return phraseScores;
    }

    /**
     * Counts words of {@code text} that are not in {@link #commonWords}.
     *
     * <p>Fixes over the previous inline loops: tokens are stripped of
     * punctuation before lookup (so "the," matches "the"), empty tokens
     * produced by leading or repeated whitespace are ignored instead of
     * being scored as uncommon, and lower-casing uses {@link Locale#ROOT}
     * so matching is locale-independent.
     */
    private static int countUncommonWords(String text) {
        int count = 0;
        for (String token : text.split("\\s+")) {
            String word = token.replaceAll("[^\\p{L}\\p{Nd}]", "").toLowerCase(Locale.ROOT);
            if (!word.isEmpty() && !commonWords.contains(word)) {
                count++;
            }
        }
        return count;
    }
}
+68 −7
Original line number Diff line number Diff line
@@ -4,17 +4,45 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.zip.Inflater;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;

import edu.bu.LanguageCorrection.AnomalyDetector;
import edu.bu.LanguageCorrection.TextProcessor;

public class Checker {
    public void analyze(String text) {
        List<String> sentences = TextProcessor.extractSentences(text);
        AnomalyDetector detector = new AnomalyDetector();

        Map<String, Integer> sentenceScores = detector.analyzeSentences(sentences);
        Map<String, Integer> phraseScores = detector.analyzePhrases(sentences);
        TrieNode detector = loadFile("metadata.ser");

        Map<String, Float> sentenceScores = new HashMap<>();
        Map<String, Float> phraseScores = new HashMap<>();

        for (String sentence : sentences) {
            //System.out.println("Analyzing sentence: " + sentence);
            List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3);

            // Calculate perplexity (score) for each phrase
            for (String phrase : phrases) {
                // System.out.println("Analyzing phrase: " + phrase);
                float perplexity = detector.perplexity(phrase);
                if (perplexity > 100) {
                    phraseScores.put(phrase, 100f);
                } else {
                    phraseScores.put(phrase, perplexity);
                }
            }

            // Calculate average perplexity for the sentence
            float sentenceScore = 0;
            for (String phrase : phrases) {
                sentenceScore += phraseScores.get(phrase);
            }
            sentenceScore /= phrases.size();
            sentenceScores.put(sentence, sentenceScore);
        }

        // Output results in JSON format
        System.out.println("{");
@@ -22,9 +50,9 @@ public class Checker {
        System.out.println("\"phrases\": " + mapToJson(phraseScores));
        System.out.println("}");
    }
    private static String mapToJson(Map<String, Integer> map) {
    private static String mapToJson(Map<String, Float> map) {
        StringBuilder jsonBuilder = new StringBuilder("{");
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
        for (Map.Entry<String, Float> entry : map.entrySet()) {
            jsonBuilder.append("\"" + entry.getKey() + "\": " + entry.getValue() + ",");
        }
        jsonBuilder.deleteCharAt(jsonBuilder.length() - 1); // remove last comma
@@ -32,6 +60,39 @@ public class Checker {

        return jsonBuilder.toString();
    }
    /**
     * Inflates zlib/DEFLATE-compressed bytes.
     *
     * <p>Best-effort: logs to stderr and returns an empty array on corrupt
     * data (matching the previous behavior) rather than throwing.
     *
     * <p>Fixes: {@code decompressor.end()} now runs in a finally block —
     * previously it was skipped on the error path, leaking the Inflater's
     * native zlib memory; a guard breaks out when the input is truncated,
     * where {@code inflate} returns 0 forever but {@code finished()} stays
     * false, which previously spun in an infinite loop.
     *
     * @param compressedData zlib-compressed bytes
     * @return decompressed bytes, or an empty array on error
     */
    private static byte[] decompress(byte[] compressedData) {
        Inflater decompressor = new Inflater();
        decompressor.setInput(compressedData);

        ByteArrayOutputStream bos = new ByteArrayOutputStream(compressedData.length);

        byte[] buf = new byte[1024];
        try {
            while (!decompressor.finished()) {
                int count = decompressor.inflate(buf);
                if (count == 0 && decompressor.needsInput()) {
                    break; // truncated input: no more bytes will ever arrive
                }
                bos.write(buf, 0, count);
            }
            return bos.toByteArray();
        } catch (Exception e) {
            System.err.println("Error decompressing data: " + e.getMessage());
            return new byte[0];
        } finally {
            decompressor.end(); // always release native zlib resources
        }
    }

    /**
     * Loads the serialized language-model trie from {@code filePath}
     * (zlib-compressed trie data).
     *
     * <p>Best-effort: returns an empty {@code TrieNode} when the file
     * cannot be read, after logging to stderr.
     *
     * <p>Fix: the "loaded successfully" notice now goes to stderr — this
     * program emits its JSON results on stdout, and the old
     * {@code System.out.println} corrupted that stream for any consumer
     * parsing the output.
     *
     * @param filePath path to the compressed metadata file
     * @return deserialized trie, or an empty trie on I/O failure
     */
    private TrieNode loadFile(String filePath) {
        try (FileInputStream fis = new FileInputStream(filePath)) {
            byte[] compressedData = fis.readAllBytes();
            byte[] decompressedData = decompress(compressedData);
            TrieNode trie = new TrieNode();
            trie.deserialize(decompressedData);
            System.err.println("Metadata loaded successfully.");
            return trie;
        } catch (IOException e) {
            System.err.println("Error reading metadata from file: " + e.getMessage());
            return new TrieNode();
        }
    }
    public static void main(String[] args) {
        if (args.length > 1 && "--file".equals(args[0])) { // check syntax
            String path = args[1];
+4 −1
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@ import java.util.Set;

public class TextProcessor {
    /**
     * Splits text into sentences on terminal punctuation (., !, ?) followed
     * by a space or newline. The matched delimiter is consumed, so interior
     * sentences lose their terminator; the final sentence keeps its
     * punctuation (there is no trailing separator to split on).
     *
     * <p>Fix: removes diff residue (the superseded plain-"\\." split line
     * left alongside the new one). {@code [.!?][ \n]} is the simplified
     * equivalent of the alternation {@code ([.!?] )|([.!?]\n)}.
     *
     * @param text raw input text
     * @return mutable list of sentences, in order
     */
    public static List<String> extractSentences(String text) {
        List<String> sentences = List.of(text.split("[.!?][ \n]"));
        return new ArrayList<>(sentences);
    }

@@ -14,6 +14,9 @@ public class TextProcessor {
    public static List<String> extractPhrases(String sentence, int minN, int maxN) {
        // Using a Set to avoid duplicate phrases
        Set<String> phraseSet = new HashSet<>();
        // Remove punctuation
        sentence = sentence.replaceAll("[^a-zA-Z0-9 ]", "");    
        // Split the sentence into words
        String[] words = sentence.split("\\s+");

        // Loop over the range of n values