Commit 0efd452b authored by Moises Bensadon's avatar Moises Bensadon
Browse files

Merge branch 'checker-module' into 'crawler-module'

Integrate Checker with Crawler

See merge request ec504/ec504_projects/group7!1
parents fe408833 9eaf12e6
Loading
Loading
Loading
Loading
+65 −0
Original line number Diff line number Diff line
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Assigns heuristic anomaly scores to sentences and to the 2- and 3-word phrases inside them.
 *
 * <p>Every score is capped at {@code 100}; a higher score means the text looks more unusual.
 * A score combines a length component with the number of words that are missing from a small
 * built-in list of common English words.
 */
public class AnomalyDetector {

    private static final Set<String> commonWords = new HashSet<>();
    private static final int AVERAGE_WORD_LENGTH = 5; // TODO: Change later

    /** Sentence length (in characters) that is treated as "typical". */
    private static final int EXPECTED_SENTENCE_LENGTH = AVERAGE_WORD_LENGTH * 10;
    /** Phrases longer than this many characters receive the length penalty. */
    private static final int MAX_TYPICAL_PHRASE_LENGTH = AVERAGE_WORD_LENGTH * 3;
    /** Upper bound applied to every score. */
    private static final int MAX_SCORE = 100;

    private static final int SENTENCE_UNCOMMON_WORD_PENALTY = 10;
    private static final int PHRASE_UNCOMMON_WORD_PENALTY = 5;
    private static final int PHRASE_LENGTH_PENALTY = 20;

    static { // simple list of common words for demonstration purposes
        commonWords.add("the");
        commonWords.add("be");
        commonWords.add("to");
        commonWords.add("of");
        commonWords.add("and"); // we can add more later
    }

    /**
     * Scores each sentence by its distance from the expected sentence length plus a fixed
     * penalty per uncommon word.
     *
     * @param sentences sentences to score; must not be {@code null} or contain {@code null}
     * @return map from sentence text to its score in {@code [0, 100]}; identical sentences
     *         share a single entry
     */
    public Map<String, Integer> analyzeSentences(List<String> sentences) {
        Map<String, Integer> sentenceScores = new HashMap<>();
        for (String sentence : sentences) {
            // length variance: absolute distance from the assumed average sentence length
            int score = Math.abs(sentence.length() - EXPECTED_SENTENCE_LENGTH);

            // word rarity
            score += countUncommonWords(sentence) * SENTENCE_UNCOMMON_WORD_PENALTY;

            sentenceScores.put(sentence, Math.min(score, MAX_SCORE)); // Limit to 100 (normalize)
        }
        return sentenceScores;
    }

    /**
     * Scores every 2- and 3-word phrase returned by
     * {@code TextProcessor.extractPhrases(sentence, 2, 3)} for each sentence.
     *
     * <p>A phrase receives a fixed penalty when its character length is below the average word
     * length or above three times that length, plus a smaller penalty per uncommon word.
     *
     * @param sentences sentences to extract phrases from; must not be {@code null}
     * @return map from phrase text to its score in {@code [0, 100]}; a phrase that occurs in
     *         several sentences has a single entry
     */
    public Map<String, Integer> analyzePhrases(List<String> sentences) {
        Map<String, Integer> phraseScores = new HashMap<>();

        for (String sentence : sentences) {
            List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3);

            for (String phrase : phrases) {
                int score = 0;
                // Length variance: penalise phrases that are unusually short or long
                if (phrase.length() < AVERAGE_WORD_LENGTH || phrase.length() > MAX_TYPICAL_PHRASE_LENGTH) {
                    score += PHRASE_LENGTH_PENALTY;
                }
                // Word rarity
                score += countUncommonWords(phrase) * PHRASE_UNCOMMON_WORD_PENALTY;

                phraseScores.put(phrase, Math.min(score, MAX_SCORE)); // Normalize
            }
        }

        return phraseScores;
    }

    /**
     * Counts the whitespace-separated words in {@code text} that are not in the common-word list.
     *
     * <p>Empty tokens are ignored: splitting text that starts with whitespace produces an empty
     * first token, which must not be scored as an uncommon word.
     */
    private static int countUncommonWords(String text) {
        int uncommon = 0;
        for (String word : text.split("\\s+")) {
            if (word.isEmpty()) {
                continue;
            }
            // Locale.ROOT keeps the lookup independent of the JVM's default locale.
            if (!commonWords.contains(word.toLowerCase(java.util.Locale.ROOT))) {
                uncommon++;
            }
        }
        return uncommon;
    }
}

Checker/Checker.java

0 → 100644
+47 −0
Original line number Diff line number Diff line
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;

/**
 * Command-line entry point that scores the sentences and phrases of a text file and prints the
 * result as a JSON object on standard output.
 */
public class Checker {

    /**
     * Splits {@code text} into sentences, scores the sentences and their phrases, and prints a
     * JSON object with the two members {@code "sentences"} and {@code "phrases"}.
     *
     * @param text the full text to analyse
     */
    public void analyze(String text) {
        List<String> sentences = TextProcessor.extractSentences(text);
        AnomalyDetector detector = new AnomalyDetector();

        Map<String, Integer> sentenceScores = detector.analyzeSentences(sentences);
        Map<String, Integer> phraseScores = detector.analyzePhrases(sentences);

        // Output results in JSON format
        System.out.println("{");
        System.out.println("\"sentences\": " + mapToJson(sentenceScores) + ",");
        System.out.println("\"phrases\": " + mapToJson(phraseScores));
        System.out.println("}");
    }

    /**
     * Renders a map as a JSON object whose member names are the (escaped) map keys.
     *
     * <p>An empty map renders as <code>{}</code>.
     *
     * @param map entries to serialise
     * @return the JSON object text
     */
    private static String mapToJson(Map<String, Integer> map) {
        StringBuilder jsonBuilder = new StringBuilder("{");
        boolean first = true;
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            // Write the separator before every entry but the first, so nothing has to be
            // removed afterwards (removing a trailing comma breaks on an empty map).
            if (!first) {
                jsonBuilder.append(',');
            }
            first = false;
            jsonBuilder.append('"').append(escapeJson(entry.getKey())).append("\": ").append(entry.getValue());
        }
        jsonBuilder.append("}");

        return jsonBuilder.toString();
    }

    /**
     * Escapes a string for use inside a JSON string literal: quotes, backslashes and control
     * characters (sentences read from a file routinely contain newlines and quotes).
     */
    private static String escapeJson(String value) {
        StringBuilder escaped = new StringBuilder(value.length() + 16);
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '"':
                    escaped.append("\\\"");
                    break;
                case '\\':
                    escaped.append("\\\\");
                    break;
                case '\n':
                    escaped.append("\\n");
                    break;
                case '\r':
                    escaped.append("\\r");
                    break;
                case '\t':
                    escaped.append("\\t");
                    break;
                case '\b':
                    escaped.append("\\b");
                    break;
                case '\f':
                    escaped.append("\\f");
                    break;
                default:
                    if (c < 0x20) {
                        // Remaining control characters have no short escape form.
                        escaped.append(String.format("\\u%04x", (int) c));
                    } else {
                        escaped.append(c);
                    }
            }
        }
        return escaped.toString();
    }

    /**
     * Usage: {@code --file [path]}. Reads the file as UTF-8 and prints its analysis.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        if (args.length > 1 && "--file".equals(args[0])) { // check syntax
            String path = args[1];
            try {
                // Read entire file; decode explicitly so the result does not depend on the
                // platform default charset.
                String content = new String(Files.readAllBytes(Paths.get(path)),
                        java.nio.charset.StandardCharsets.UTF_8);

                Checker checker = new Checker(); // Run checker
                checker.analyze(content);
            } catch (IOException e) {
                System.err.println("Error reading file: " + e.getMessage());
            }
        } else {
            System.out.println("Invalid arguments. Usage: CLI --file [path]");
        }
    }
}

Checker/Corrector.java

0 → 100644
+140 −0
Original line number Diff line number Diff line
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

/**
 * Word-level corrector driven by unigram, bigram and trigram frequencies learned from a
 * plain-text corpus (one whitespace-tokenised line at a time, case-insensitive).
 */
public class Corrector {
    /** Corpus used by the no-argument constructor, relative to the working directory. */
    private static final String DEFAULT_CORPUS_PATH = "Checker/brown.txt";

    private static final double BACKOFF_PENALTY = 0.1;

    private final Map<String, Double> trigramProbabilities = new HashMap<>();
    private final Map<String, Double> bigramProbabilities = new HashMap<>();
    private final Map<String, Double> unigramProbabilities = new HashMap<>();

    /** Creates a corrector trained on the default corpus file {@code Checker/brown.txt}. */
    public Corrector() {
        this(DEFAULT_CORPUS_PATH);
    }

    /**
     * Creates a corrector trained on the given corpus file.
     *
     * <p>If the file cannot be read, an error is reported on standard error and the model stays
     * empty; {@link #correct(String)} then returns its input words unchanged.
     *
     * @param corpusPath path of a UTF-8 text file with whitespace-separated words
     */
    public Corrector(String corpusPath) {
        loadCorpus(corpusPath);
    }

    /** Counts the 1-, 2- and 3-grams of every corpus line and converts counts to frequencies. */
    private void loadCorpus(String corpusPath) {
        Map<String, Integer> unigramCounts = new HashMap<>();
        Map<String, Integer> bigramCounts = new HashMap<>();
        Map<String, Integer> trigramCounts = new HashMap<>();

        try (BufferedReader br = new BufferedReader(new java.io.InputStreamReader(
                Files.newInputStream(Paths.get(corpusPath)), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    continue; // splitting a blank line would yield one empty "word"
                }
                // Trimming first keeps split() from producing an empty leading token.
                String[] words = trimmed.split("\\s+");
                for (int i = 0; i < words.length; i++) {
                    words[i] = lower(words[i]);
                }

                // The bounds checks handle short lines, so a two-word line still contributes
                // its bigram.
                for (int i = 0; i < words.length; i++) {
                    unigramCounts.merge(words[i], 1, Integer::sum);
                    if (i + 1 < words.length) {
                        bigramCounts.merge(words[i] + " " + words[i + 1], 1, Integer::sum);
                    }
                    if (i + 2 < words.length) {
                        trigramCounts.merge(words[i] + " " + words[i + 1] + " " + words[i + 2], 1, Integer::sum);
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("Error reading corpus " + corpusPath + ": " + e.getMessage());
            return; // leave the model empty rather than half-populated
        }

        storeRelativeFrequencies(unigramCounts, unigramProbabilities);
        storeRelativeFrequencies(bigramCounts, bigramProbabilities);
        storeRelativeFrequencies(trigramCounts, trigramProbabilities);
    }

    /** Writes {@code count / total of all counts} for every entry of {@code counts} into {@code target}. */
    private static void storeRelativeFrequencies(Map<String, Integer> counts, Map<String, Double> target) {
        int total = 0;
        for (int count : counts.values()) {
            total += count;
        }
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            target.put(entry.getKey(), (double) entry.getValue() / total);
        }
    }

    /**
     * Returns {@code input} with each word either kept or replaced by a suggestion.
     *
     * <p>A three-word window slides over the text one word at a time. The window starting at a
     * word decides only that word: it is kept when the (lower-cased) trigram is known, otherwise
     * it is replaced by {@code suggestCorrection}. The last two words start no window and are
     * kept as they are. Every input word therefore yields exactly one output word, in order,
     * separated by single spaces.
     *
     * @param input text to correct; must not be {@code null}
     * @return the corrected text, or the empty string for blank input
     */
    public String correct(String input) {
        String trimmed = input.trim();
        if (trimmed.isEmpty()) {
            return "";
        }
        String[] words = trimmed.split("\\s+");
        StringBuilder correctedSentence = new StringBuilder();

        for (int i = 0; i < words.length; i++) {
            String chosen = words[i];
            if (i + 2 < words.length) {
                String trigram = lower(words[i]) + " " + lower(words[i + 1]) + " " + lower(words[i + 2]);
                if (!trigramProbabilities.containsKey(trigram)) {
                    chosen = suggestCorrection(words[i], words[i + 1], words[i + 2]);
                }
            }
            if (correctedSentence.length() > 0) {
                correctedSentence.append(' ');
            }
            correctedSentence.append(chosen);
        }

        return correctedSentence.toString();
    }

    /**
     * Picks the replacement for the first word of an unseen three-word window.
     *
     * <p>The result is always one of the three window words: {@code word1} when the trigram score
     * is the lowest (ties included), otherwise {@code word2} when the bigram score does not exceed
     * the unigram score, otherwise {@code word3}. With an empty model all scores are infinite and
     * {@code word1} is returned.
     *
     * <p>NOTE(review): no candidate words from outside the window are considered, so a
     * "correction" can only duplicate a neighbouring word. The heuristic is kept as written;
     * revisit together with {@code calculatePerplexity}.
     */
    private String suggestCorrection(String word1, String word2, String word3) {
        // Trigram, Bigram, and Unigram perplexities
        double trigramPerplexity = calculatePerplexity(trigramProbabilities, word1, word2, word3);
        double bigramPerplexity = calculatePerplexity(bigramProbabilities, word1, word2, "");
        double unigramPerplexity = calculatePerplexity(unigramProbabilities, word1, "", "");

        if (trigramPerplexity <= bigramPerplexity && trigramPerplexity <= unigramPerplexity) {
            return word1;
        } else if (bigramPerplexity <= unigramPerplexity) {
            return word2;
        } else {
            return word3;
        }
    }

    /**
     * Returns the reciprocal of a backed-off probability for the given words.
     *
     * <p>Lookup order: the three words joined by single spaces in {@code probabilities}; then
     * {@code "word1 word2"} in the same table, scaled by {@code BACKOFF_PENALTY}; then
     * {@code word1} in the unigram table, scaled by {@code BACKOFF_PENALTY} squared. When every
     * lookup misses, the probability is zero and the result is {@code Double.POSITIVE_INFINITY}.
     *
     * <p>NOTE(review): {@code word3} is used verbatim in the first key (not lower-cased), and the
     * lower-order calls pass {@code ""}, which leaves trailing spaces in that key. The loader
     * never stores keys ending in a space, so the first lookup looks like it can only hit for the
     * trigram call. The second lookup also stays in {@code probabilities} rather than switching
     * to the bigram table. Both are preserved deliberately because changing them alters which
     * word {@code suggestCorrection} returns; confirm the intended back-off scheme first.
     */
    private double calculatePerplexity(Map<String, Double> probabilities, String word1, String word2, String word3) {
        String first = lower(word1);
        String second = lower(word2);
        double probability = probabilities.getOrDefault(first + " " + second + " " + word3, 0.0);

        // If probability is zero, BACKOFF
        if (probability == 0.0) {
            probability = probabilities.getOrDefault(first + " " + second, 0.0) * BACKOFF_PENALTY;
            if (probability == 0.0) {
                probability = unigramProbabilities.getOrDefault(first, 0.0) * BACKOFF_PENALTY * BACKOFF_PENALTY;
            }
        }
        // Perplexity (1.0 / 0.0 evaluates to positive infinity for doubles, it does not throw)
        return 1.0 / probability;
    }

    /** Lower-cases with {@code Locale.ROOT} so n-gram keys do not depend on the JVM's default locale. */
    private static String lower(String word) {
        return word.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Usage: {@code --file [path]}. Reads the file as UTF-8 and prints the corrected text.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        if (args.length > 1 && "--file".equals(args[0])) { // check syntax
            String path = args[1];
            try {
                // Decode explicitly so the result does not depend on the platform default charset.
                String content = new String(Files.readAllBytes(Paths.get(path)),
                        java.nio.charset.StandardCharsets.UTF_8);
                Corrector corrector = new Corrector(); // Run corrector
                String corrected = corrector.correct(content);
                System.out.println(corrected);
            } catch (IOException e) {
                System.err.println("Error reading file: " + e.getMessage());
            }
        } else {
            System.out.println("Invalid arguments. Usage: CLI --file [path]");
        }
    }
}
 No newline at end of file
+32 −0
Original line number Diff line number Diff line
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/** Static helpers that break raw text into sentences and word n-gram phrases. */
public class TextProcessor {

    /**
     * Splits {@code text} into sentences at every {@code '.'} character.
     *
     * <p>Each sentence is trimmed of surrounding whitespace (including line breaks), and
     * fragments that are blank after trimming are dropped, so consecutive periods or a trailing
     * newline do not produce empty sentences.
     *
     * @param text text to split; must not be {@code null}
     * @return a new mutable list of the non-blank sentences in input order, without the periods
     */
    public static List<String> extractSentences(String text) {
        List<String> sentences = new ArrayList<>();
        for (String fragment : text.split("\\.")) {
            String sentence = fragment.trim();
            if (!sentence.isEmpty()) {
                sentences.add(sentence);
            }
        }
        return sentences;
    }

    /**
     * Extracts the distinct word n-grams of {@code sentence} for every {@code n} from
     * {@code minN} to {@code maxN}, words being separated by whitespace.
     *
     * <p>Phrases are returned in order of first occurrence: all phrases of the smallest length
     * first, each length scanned left to right. Words inside a phrase are joined by one space.
     *
     * @param sentence text to take phrases from; must not be {@code null}
     * @param minN smallest phrase length in words; values below 1 are treated as 1
     * @param maxN largest phrase length in words
     * @return a new mutable list of distinct phrases; empty when the sentence has fewer than
     *         {@code minN} words
     */
    public static List<String> extractPhrases(String sentence, int minN, int maxN) {
        // Insertion-ordered set: removes duplicates while keeping the order of the text.
        Set<String> phraseSet = new java.util.LinkedHashSet<>();

        // Trim first: split() on text with leading whitespace yields an empty first word.
        String trimmed = sentence.trim();
        if (trimmed.isEmpty()) {
            return new ArrayList<>();
        }
        String[] words = trimmed.split("\\s+");

        // Loop over the range of n values; a zero-word phrase is meaningless, so start at 1.
        for (int n = Math.max(minN, 1); n <= maxN; n++) {
            for (int start = 0; start + n <= words.length; start++) {
                StringBuilder phrase = new StringBuilder(words[start]);
                for (int j = start + 1; j < start + n; j++) {
                    phrase.append(' ').append(words[j]);
                }
                phraseSet.add(phrase.toString());
            }
        }

        // Convert the set back to a list to maintain the original interface
        return new ArrayList<>(phraseSet);
    }
}

Checker/brown.txt

0 → 100644
+99007 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading