Commit 5cd67ecc authored by Moises Bensadon
Browse files

initial corrector



Co-authored-by: Manuel Segimon <manuelsp@bu.edu>
parent a972482c
Loading
Loading
Loading
Loading

Checker/Corrector.java

0 → 100644
+130 −0
Original line number Diff line number Diff line
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Sentence corrector backed by unigram, bigram and trigram relative
 * frequencies counted from a whitespace-tokenised text corpus.
 *
 * <p>Each probability map stores, for every lower-cased n-gram seen in the
 * corpus, its count divided by the total number of n-grams of the same order.
 * N-grams never span a line break.
 *
 * <p>If no corpus can be read the maps stay empty; {@link #correct(String)}
 * still works and then treats every trigram as unknown.
 */
public class Corrector {
    /** Corpus location tried when the caller's path is {@code null} or cannot be read. */
    private static final String DEFAULT_CORPUS_PATH = "Checker/brown.txt";

    /** Factor applied to a probability for each level a lookup falls back. */
    private static final double BACKOFF_PENALTY = 0.1;

    /** Token separator, compiled once because it is used for every corpus line. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    private final Map<String, Double> trigramProbabilities = new HashMap<>();
    private final Map<String, Double> bigramProbabilities = new HashMap<>();
    private final Map<String, Double> unigramProbabilities = new HashMap<>();

    /**
     * Builds the n-gram model from a corpus file.
     *
     * <p>{@code filePath} is tried first. If it is {@code null} or cannot be
     * read, the legacy location {@value #DEFAULT_CORPUS_PATH} is tried, so
     * callers that relied on the previously hard-coded path keep working.
     * Failures are reported on {@code System.err}; no exception is thrown.
     *
     * @param filePath path of the corpus file, may be {@code null}
     */
    public Corrector(String filePath) {
        boolean loaded = filePath != null && loadBrown(filePath);
        if (!loaded && !DEFAULT_CORPUS_PATH.equals(filePath)) {
            System.err.println("Corrector: falling back to corpus '" + DEFAULT_CORPUS_PATH + "'");
            loaded = loadBrown(DEFAULT_CORPUS_PATH);
        }
        if (!loaded) {
            System.err.println("Corrector: no corpus loaded, every trigram will be treated as unknown");
        }
    }

    /**
     * Reads the corpus at {@code path} and fills the three probability maps.
     *
     * <p>Counts are accumulated locally and only converted to probabilities
     * after the whole file was read, so a failed read leaves the maps untouched.
     *
     * @param path corpus file to read
     * @return {@code true} if the file was read completely, {@code false} on an I/O error
     */
    private boolean loadBrown(String path) {
        Map<String, Integer> unigramCounts = new HashMap<>();
        Map<String, Integer> bigramCounts = new HashMap<>();
        Map<String, Integer> trigramCounts = new HashMap<>();

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Blank lines yield no tokens, so "" is never counted as a word.
                String[] words = tokenize(line);
                for (int i = 0; i < words.length; i++) {
                    words[i] = normalize(words[i]);
                }
                for (int i = 0; i < words.length; i++) {
                    unigramCounts.merge(words[i], 1, Integer::sum);
                    // Bigrams are counted on two-word lines as well.
                    if (i + 1 < words.length) {
                        bigramCounts.merge(words[i] + " " + words[i + 1], 1, Integer::sum);
                    }
                    if (i + 2 < words.length) {
                        trigramCounts.merge(
                                words[i] + " " + words[i + 1] + " " + words[i + 2], 1, Integer::sum);
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("Corrector: cannot read corpus '" + path + "': " + e);
            return false;
        }

        storeRelativeFrequencies(unigramCounts, unigramProbabilities);
        storeRelativeFrequencies(bigramCounts, bigramProbabilities);
        storeRelativeFrequencies(trigramCounts, trigramProbabilities);
        return true;
    }

    /** Writes {@code count / sum of all counts} for every entry of {@code counts} into {@code target}. */
    private static void storeRelativeFrequencies(Map<String, Integer> counts, Map<String, Double> target) {
        long total = counts.values().stream().mapToLong(Integer::longValue).sum();
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            target.put(entry.getKey(), (double) entry.getValue() / total);
        }
    }

    /** Splits {@code text} on runs of whitespace; blank text gives an empty array. */
    private static String[] tokenize(String text) {
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return new String[0];
        }
        return WHITESPACE.split(trimmed);
    }

    /** Lower-cases a token independently of the default locale so model keys are stable. */
    private static String normalize(String word) {
        return word.toLowerCase(Locale.ROOT);
    }

    /**
     * Returns {@code input} with one output word per input word, separated by
     * single spaces.
     *
     * <p>For every position that starts a three-word window, the word is kept
     * when the lower-cased window is a trigram of the model and is otherwise
     * replaced by {@link #suggestCorrection}. The last two words start no
     * window and are copied unchanged, as is any input shorter than three words.
     *
     * @param input sentence to check; {@code null} or blank yields {@code ""}
     * @return the processed sentence, never {@code null}
     */
    public String correct(String input) {
        if (input == null) {
            return "";
        }
        String[] words = tokenize(input);
        StringBuilder correctedSentence = new StringBuilder();

        for (int i = 0; i < words.length; i++) {
            String output = words[i];
            if (i + 2 < words.length) {
                String trigram = normalize(words[i]) + " "
                        + normalize(words[i + 1]) + " "
                        + normalize(words[i + 2]);
                if (!trigramProbabilities.containsKey(trigram)) {
                    output = suggestCorrection(words[i], words[i + 1], words[i + 2]);
                }
            }
            if (i > 0) {
                correctedSentence.append(' ');
            }
            correctedSentence.append(output);
        }

        return correctedSentence.toString();
    }

    /**
     * Picks the word to emit for a window whose trigram is not in the model.
     *
     * <p>Three figures are computed with {@link #calculatePerplexity}, one per
     * probability map. {@code word1} is returned when the trigram-map figure is
     * no larger than the other two, otherwise {@code word2} when the bigram-map
     * figure is no larger than the unigram-map figure, otherwise {@code word3}.
     *
     * <p>NOTE(review): the result is always one of the three input words; no
     * new candidate word is ever proposed. The intended selection rule is not
     * evident from this file, so the comparisons are kept exactly as they were.
     */
    private String suggestCorrection(String word1, String word2, String word3) {
        double trigramPerplexity = calculatePerplexity(trigramProbabilities, word1, word2, word3);
        double bigramPerplexity = calculatePerplexity(bigramProbabilities, word1, word2, "");
        double unigramPerplexity = calculatePerplexity(unigramProbabilities, word1, "", "");

        if (trigramPerplexity <= bigramPerplexity && trigramPerplexity <= unigramPerplexity) {
            return word1;
        } else if (bigramPerplexity <= unigramPerplexity) {
            return word2;
        } else {
            return word3;
        }
    }

    /**
     * Returns {@code 1 / p}, where {@code p} is the first non-zero value of:
     * <ol>
     *   <li>{@code probabilities["word1 word2 word3"]};</li>
     *   <li>{@code probabilities["word1 word2"] * BACKOFF_PENALTY};</li>
     *   <li>{@code unigramProbabilities["word1"] * BACKOFF_PENALTY^2}.</li>
     * </ol>
     * All words are lower-cased first. If every step yields zero the result is
     * {@code Double.POSITIVE_INFINITY}.
     *
     * <p>The step-1 key always joins all three arguments with single spaces, so
     * with an empty {@code word2} or {@code word3} it ends in a space and
     * cannot match a model key (model keys never end in whitespace). Step 2
     * reads the same map that was passed in, so it cannot match when that map
     * holds trigrams.
     *
     * <p>NOTE(review): {@link #suggestCorrection} currently depends on these
     * exact figures; changing the key format or the map used in step 2 changes
     * which word it selects. Decide the intended rule before altering this.
     */
    private double calculatePerplexity(Map<String, Double> probabilities, String word1, String word2, String word3) {
        String first = normalize(word1);
        String second = normalize(word2);
        String fullKey = first + " " + second + " " + normalize(word3);
        double probability = probabilities.getOrDefault(fullKey, 0.0);

        // Back off to shorter n-grams, penalising each level.
        if (probability == 0.0) {
            probability = probabilities.getOrDefault(first + " " + second, 0.0) * BACKOFF_PENALTY;
            if (probability == 0.0) {
                probability = unigramProbabilities.getOrDefault(first, 0.0) * BACKOFF_PENALTY * BACKOFF_PENALTY;
            }
        }

        // Floating-point division: a zero probability gives +Infinity, not an exception.
        return 1.0 / probability;
    }

    /** Demo entry point: loads the corpus and prints the processed sample sentence. */
    public static void main(String[] args) {
        Corrector corrector = new Corrector("brown.txt");
        String input = "This is strange so choice";
        String corrected = corrector.correct(input);
        System.out.println("Corrected: " + corrected);
    }
}

Checker/brown.txt

0 → 100644
+99007 −0

File added.

Preview size limit exceeded, changes collapsed.