Commit 7c83c077 authored by Manuel  Segimon's avatar Manuel Segimon
Browse files

Merge branch 'master' into '13-gui'

# Conflicts:
#   src/main/java/edu/bu/LanguageCorrection/Checker.java
#   src/main/java/edu/bu/LanguageCorrection/crawler.java
parents a24227ab a18c93ca
Loading
Loading
Loading
Loading
+41 −18
Original line number Diff line number Diff line
The INSTALL.txt file should contain all information needed to install and run your code, from scratch, on a lab machine, including:
- Pre-conditions
  - Hardware, peripherals, and operating system restrictions for running your code ( i.e., can this code only run on lab machines, or can it run elsewhere?)
    - All code should run on our lab computers, unless you have an exemption from the instructor.
    - If some of your features work more efficiently on specific hardware, please explain this here.

- Supporting files
  - A list of non-standard libraries needed for your project to run, including:
    - Clear and simple instructions for how to freely (and legally!) acquire and install them from source code with minimal effort.
    - You may additionally link to a binary version of the libraries, if you wish.
  - Examples of how to use your project
    - Several clear examples that illustrate the main features of your project.
  - Descriptions of testing patterns, and instructions on how to exercise them:
    - unit tests
    - system tests
    
- Execution
  - Clear and terse instructions on how an average student in the class can compile and run all your code, from scratch, on a lab machine.
 No newline at end of file
INSTALL.txt for Language Correction Tool

Pre-conditions
  - Hardware and OS Requirements:
    - The application runs on lab computers equipped with CentOS. It is cross-platform and compatible with any system that supports Java 17.

  - Java Requirement:
    - Java JDK 17 is required.

Supporting Files
  - External Libraries:
    - Jsoup: For HTML parsing.

Installation Instructions
  - Install Java JDK 17:
    - Follow the download and installation instructions from Oracle's website (https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html)
    - Set up JAVA_HOME and update your system's PATH.

  - Install Maven:
    - Download Maven from the Apache Maven website (https://maven.apache.org/download.cgi).
    - Follow the detailed installation instructions on the Maven website.
    - Ensure Maven's bin directory is in your system's PATH.

  - Setup Project:
    - Download or clone the project repository.
    - Navigate to the project directory (where pom.xml is located).
    - Run mvn clean install to resolve dependencies and build the project.

Execution Instructions
  - Launch the application:
    - Open a terminal.
    - Change to the project root directory (where pom.xml is located); Maven must be run from there.
    - Run the application using Maven.

Usage Examples
  - Starting the Application:
    - Execute the application as per the instructions above.
    - Select a module from the drop-down menu (Crawler, Checker, or Corrector).
    - Enter a URL or a local file path in the text field.
    - Click Run to execute the selected module.
    - Have fun!!!
+148 −90
Original line number Diff line number Diff line
package edu.bu.LanguageCorrection;
import java.io.BufferedReader;
import java.io.FileReader;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.zip.Inflater;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.util.List;
import java.util.Map;
import java.util.ArrayList;
import java.util.PriorityQueue;
import java.util.Collections;
import java.util.HashMap;

public class Corrector {
    private Map<String, Double> trigramProbabilities;
    private Map<String, Double> bigramProbabilities;
    private Map<String, Double> unigramProbabilities;

    private static final double BACKOFF_PENALTY = 0.1;
public class Corrector {
    private TrieNode detector;

    /**
     * Builds a corrector: initializes the three n-gram probability maps, runs
     * {@code loadBrown()}, and then loads the detector trie from {@code "metadata.ser"}
     * through {@code loadFile}.
     *
     * <p>NOTE(review): the map initializations and the {@code loadBrown()} call look like
     * removed-side lines of the merge diff, leaving only the {@code detector} assignment
     * in the merged file -- confirm against the repository.
     */
    public Corrector() {
        trigramProbabilities = new HashMap<>();
        bigramProbabilities = new HashMap<>();
        unigramProbabilities = new HashMap<>();
        loadBrown();
        // loadFile falls back to an empty TrieNode when the file cannot be read
        detector = loadFile("metadata.ser");
    }

    private void loadBrown() {
        try (BufferedReader br = new BufferedReader(new FileReader("Checker/brown.txt"))) {
            String line;
            Map<String, Integer> bigramCounts = new HashMap<>();
            Map<String, Integer> trigramCounts = new HashMap<>();
            Map<String, Integer> unigramCounts = new HashMap<>();

            while ((line = br.readLine()) != null) {
                String[] words = line.split("\\s+");
                for (String word : words) {
                    String lowerCaseWord = word.toLowerCase();
                    unigramCounts.put(lowerCaseWord, unigramCounts.getOrDefault(lowerCaseWord, 0) + 1);
    /**
     * Loads a serialized trie from disk.
     *
     * <p>The file is read in full, its bytes are passed through {@code decompress}, and the
     * result is handed to {@code TrieNode.deserialize}. A confirmation line is printed to
     * standard output once deserialization returns.
     *
     * @param filePath path of the compressed metadata file
     * @return the deserialized trie, or a fresh empty {@code TrieNode} if an
     *         {@link IOException} occurs while opening, reading, or closing the file
     */
    private TrieNode loadFile(String filePath) {
        TrieNode loaded = new TrieNode();
        try (FileInputStream in = new FileInputStream(filePath)) {
            loaded.deserialize(decompress(in.readAllBytes()));
            System.out.println("Metadata loaded successfully.");
        } catch (IOException readFailure) {
            System.err.println("Error reading metadata from file: " + readFailure.getMessage());
            return new TrieNode();
        }
        return loaded;
    }

    private static byte[] decompress(byte[] compressedData) {
        Inflater decompressor = new Inflater();
        decompressor.setInput(compressedData);

                if (words.length < 3) continue; // Skip lines with less than 3 words
        ByteArrayOutputStream bos = new ByteArrayOutputStream(compressedData.length);

                for (int i = 0; i < words.length - 1; i++) {
                    String bigram = words[i].toLowerCase() + " " + words[i + 1].toLowerCase();
                    bigramCounts.put(bigram, bigramCounts.getOrDefault(bigram, 0) + 1);
        byte[] buf = new byte[1024];
        try {
            while (!decompressor.finished()) {
                int count = decompressor.inflate(buf);
                bos.write(buf, 0, count);
            }
            decompressor.end();
            return bos.toByteArray();
        } catch (Exception e) {
            System.err.println("Error decompressing data: " + e.getMessage());
            return new byte[0];
        }
    }

    /**
     * Produces up to five candidate rewrites of a sentence, ordered from lowest to highest
     * {@code detector.perplexity} score.
     *
     * <p>Candidates come from {@code generateSentences}. Each one is scored, and a
     * reverse-ordered priority queue discards the highest-scoring candidate whenever it
     * holds more than five entries.
     *
     * @param inputSentence the sentence to correct; split on single spaces
     * @return the surviving candidates with the lowest score at index 0; any candidate whose
     *         score exceeds 1.5 times that lowest score is replaced by an empty string
     */
    public String[] correct(String inputSentence) {
        // Divide sentence into words
        String[] words = inputSentence.split(" ");

        List<String> sentences = generateSentences(words);

        // Reverse ordering puts the HIGHEST score at the head, so poll() evicts the worst candidate
        PriorityQueue<SentenceScorePair> pq = new PriorityQueue<>(
                Collections.reverseOrder());

        for (String sentence : sentences) {
            float score = detector.perplexity(sentence);
            // System.out.println(sentence + " | Score: " + score);

            pq.offer(new SentenceScorePair(sentence, score));

                for (int i = 0; i < words.length - 2; i++) {
                    String trigram = words[i].toLowerCase() + " " +
                            words[i + 1].toLowerCase() + " " +
                            words[i + 2].toLowerCase();
                    trigramCounts.put(trigram, trigramCounts.getOrDefault(trigram, 0) + 1);
            // Ensure only the top 5 sentences are kept, remove the worst if more than 5
            if (pq.size() > 5) {
                pq.poll();
            }
        }

            int totalUnigrams = unigramCounts.values().stream().mapToInt(Integer::intValue).sum();
            int totalBigrams = bigramCounts.values().stream().mapToInt(Integer::intValue).sum();
            int totalTrigrams = trigramCounts.values().stream().mapToInt(Integer::intValue).sum();
        // Drain the queue (highest score first) into the array from the back,
        // so topPairs[0] ends up holding the lowest score
        SentenceScorePair[] topPairs = new SentenceScorePair[pq.size()];
        int index = pq.size() - 1;
        while (!pq.isEmpty()) {
            topPairs[index] = pq.poll();
            index--;
        }

        // Print sentences in the right order
        // for (SentenceScorePair pair : topPairs) {
        //     System.out.println(pair.sentence + " | Score: " + pair.score);
        // }

            for (Map.Entry<String, Integer> entry : unigramCounts.entrySet()) {
                unigramProbabilities.put(entry.getKey(), (double) entry.getValue() / totalUnigrams);
        // Convert to array of just sentences
        String[] topSentences = new String[topPairs.length];
        for (int i = 0; i < topPairs.length; i++) {
            // Blank out candidates scoring more than 1.5x the best (lowest) score
            if (topPairs[i].score > topPairs[0].score * 1.5) {
                topSentences[i] = "";
            } else {
                topSentences[i] = topPairs[i].sentence;
            }
        }

            for (Map.Entry<String, Integer> entry : bigramCounts.entrySet()) {
                bigramProbabilities.put(entry.getKey(), (double) entry.getValue() / totalBigrams);
        return topSentences;
    }

            for (Map.Entry<String, Integer> entry : trigramCounts.entrySet()) {
                trigramProbabilities.put(entry.getKey(), (double) entry.getValue() / totalTrigrams);
    /**
     * Pairs a candidate sentence with its score so candidates can be kept in a
     * {@code PriorityQueue}. The natural ordering is ascending by {@code score}.
     */
    class SentenceScorePair implements Comparable<SentenceScorePair> {
        String sentence;
        float score;

        /**
         * @param sentence the candidate sentence text
         * @param score    the score assigned to that sentence
         */
        public SentenceScorePair(String sentence, float score) {
            this.sentence = sentence;
            this.score = score;
        }

        } catch (IOException e) {
            e.printStackTrace();
        /** Orders pairs by {@code score} using {@link Float#compare(float, float)}. */
        @Override
        public int compareTo(SentenceScorePair other) {
            return Float.compare(this.score, other.score);
        }
    }

    public String correct(String input) {
        StringBuilder correctedSentence = new StringBuilder();
        String[] words = input.split("\\s+");
    /**
     * Builds the list of candidate sentences for the given words by starting the
     * {@code backtrack} search with an empty partial sentence and no word marked as used.
     *
     * @param words the words of the original sentence
     * @return every candidate sentence that {@code backtrack} recorded
     */
    public static List<String> generateSentences(String[] words) {
        List<String> candidates = new ArrayList<>();
        backtrack(candidates, words, new ArrayList<>(), new boolean[words.length]);
        return candidates;
    }

    /**
     * Recursive step of the candidate search: extends {@code current} with each word that is
     * not yet used, recurses, then undoes the choice.
     *
     * <p>On every call, {@code current} is recorded (joined with single spaces) when its size
     * is at least {@code ceil(0.75 * words.length)} and at most {@code words.length}.
     *
     * <p>NOTE(review): every ordering of the unused words is explored, so the number of
     * candidates grows factorially with the word count -- long sentences may be impractical.
     *
     * @param results accumulator that receives the recorded candidates
     * @param words   the words of the original sentence
     * @param current the partial sentence built so far (mutated and restored)
     * @param used    flags marking which indices of {@code words} are in {@code current}
     */
    private static void backtrack(List<String> results, String[] words, List<String> current, boolean[] used) {
        // Record the partial sentence once it keeps at least three quarters of the words
        if (current.size() >= Math.ceil(words.length * 3.0 / 4.0) && current.size() <= words.length) {
            results.add(String.join(" ", current));
        }

        for (int i = 0; i < words.length - 2; i++) {
            String trigram = words[i].toLowerCase() + " " +
                    words[i + 1].toLowerCase() + " " +
                    words[i + 2].toLowerCase();
        for (int i = 0; i < words.length; i++) {
            if (used[i])
                continue; // Skip used words

            if (!trigramProbabilities.containsKey(trigram)) {
                correctedSentence.append(suggestCorrection(words[i], words[i + 1], words[i + 2])).append(" ");
            } else {
                correctedSentence.append(words[i]).append(" ");
                correctedSentence.append(words[i + 1]).append(" ");
                correctedSentence.append(words[i + 2]).append(" ");
            // Choose word i, explore, then un-choose it so sibling branches start clean
            used[i] = true;
            current.add(words[i]);
            backtrack(results, words, current, used);
            current.remove(current.size() - 1);
            used[i] = false;
        }
    }

        return correctedSentence.toString().trim();
    }
    private static void printSentencesInOrderOfChanges(String[] sentences, String originalSentence) {
        // Order the sentences by the number of changes needed
        Map<String, Integer> changesMap = new HashMap<>();

    private String suggestCorrection(String word1, String word2, String word3) {
        // Trigram, Bigram, and Unigram perplexities
        double trigramPerplexity = calculatePerplexity(trigramProbabilities, word1, word2, word3);
        double bigramPerplexity = calculatePerplexity(bigramProbabilities, word1, word2, "");
        double unigramPerplexity = calculatePerplexity(unigramProbabilities, word1, "", "");
        for (String sentence : sentences) {
            if (sentence == null || sentence.isEmpty())
                continue; // Skip empty sentences (not in the 0.1 percentile of the best sentence)

        if (trigramPerplexity <= bigramPerplexity && trigramPerplexity <= unigramPerplexity) {
            return word1;
        } else if (bigramPerplexity <= unigramPerplexity) {
            return word2;
            int changes = 0;
            if (sentence.length() != originalSentence.length()) {
                changes = Math.abs(sentence.split(" ").length - originalSentence.split(" ").length) + 1;
            } else {
            return word3;
                String[] originalWords = originalSentence.split(" ");
                String[] correctedWords = sentence.split(" ");
                for (int i = 0; i < originalWords.length; i++) {
                    if (!originalWords[i].equals(correctedWords[i])) {
                        changes++;
                    }
                }
            }

    private double calculatePerplexity(Map<String, Double> probabilities, String word1, String word2, String word3) {
        String trigram = word1.toLowerCase() + " " + word2.toLowerCase() + " " + word3;
        double probability = probabilities.getOrDefault(trigram, 0.0);

        // If probability is zero, BACKOFF
        if (probability == 0.0) {
            String bigram = word1.toLowerCase() + " " + word2.toLowerCase();
            probability = probabilities.getOrDefault(bigram, 0.0) * BACKOFF_PENALTY;
            if (probability == 0.0) {
                probability = unigramProbabilities.getOrDefault(word1.toLowerCase(), 0.0) * BACKOFF_PENALTY * BACKOFF_PENALTY;
            changesMap.put(sentence, changes);
        }

        List<Map.Entry<String, Integer>> sortedList = new ArrayList<>(changesMap.entrySet());
        sortedList.sort(Map.Entry.comparingByValue());

        for (Map.Entry<String, Integer> entry : sortedList) {
            System.out.println("    " + entry.getKey() + " | Changes: " + entry.getValue());
        }
        // Perplexity
        return 1.0 / probability;
    }

    public static void main(String[] args) {
        if (args.length > 1 && "--file".equals(args[0])) { // check syntax
        if (args.length > 1 && "--file".equals(args[0])) {
            String path = args[1];
            try {
                String content = new String(Files.readAllBytes(Paths.get(path)));
                Corrector corrector = new Corrector(); // Run corrector
                String corrected = corrector.correct(content);
                System.out.println(corrected);
                String[] sentences = TextProcessor.extractSentences(content).toArray(new String[0]);
                for (String sentence : sentences) {
                    sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", "");
                    String[] corrected = corrector.correct(sentence);
                    System.out.println(sentence + " | Corrected Sentence Suggestions:");
                    printSentencesInOrderOfChanges(corrected, sentence);
                }
            } catch (IOException e) {
                System.err.println("Error reading file: " + e.getMessage());
            }
+21 −42
Original line number Diff line number Diff line
@@ -14,6 +14,9 @@ public class TrieNode implements Serializable, Cloneable {
        TrieNode current = this;
        TrieNode past = this; // Store the previous node
        for (String word : phrase) {
            if (word.length() == 0) {
                continue;
            }
            past = current;
            current = current.children.computeIfAbsent(word, c -> new TrieNode());
        }
@@ -22,66 +25,42 @@ public class TrieNode implements Serializable, Cloneable {
    }

    /**
     * Estimates the probability of a space-separated phrase from the counts stored in this trie.
     *
     * <p>The phrase is followed word by word through {@code children}. If every word is
     * found, the result is the final node's {@code count} divided by the {@code childCounts}
     * of the node visited just before it. If a word is missing, the first word is dropped
     * and the remainder of the phrase is evaluated recursively, scaled by {@code alpha}
     * (currently 1).
     *
     * @param phrase words separated by single spaces
     * @return the estimate described above, or the fixed value 0.1 when the phrase splits
     *         into one word or fewer
     */
    public float probability(String phrase) {
        String[] words = phrase.split(" ");
        if (words.length <= 1) { // Phrase has at most one word: return a fixed value without consulting the trie
            // System.out.println("Probability of phrase: " + 1 / this.childCounts);
            return (float) 0.1;
        }
        TrieNode current = this;
        TrieNode past = this;
        for (String word : phrase.split(" ")) {
        // System.out.println("Phrase: " + phrase);
        for (String word : words) {
            past = current;
            current = current.children.get(word);
            if (current == null) {
                float alpha = (float) 1;
                // System.out.println("Phrase not found in trie.");
                return 0;
                // Back off: retry with everything after the first space
                return alpha * probability(phrase.substring(phrase.indexOf(" ") + 1));
            }
        }
        // System.out.println("Probability of phrase: " + (double) current.count /
        // past.childCounts);
        // System.out.println("Probability of phrase: " + (float) current.count / past.childCounts);
        return (float) current.count / past.childCounts;
    }

    private float getAverageChildCount() {
        if (this.children.size() == 0) {
            return 1;
        }
        return (float) this.childCounts / this.children.size();
    /**
     * Returns the mean count per child of {@code node}: its {@code childCounts} divided by
     * the number of entries in its {@code children}.
     *
     * <p>A node with no children returns 1. Without that guard the float division by zero
     * would yield NaN or Infinity, which would then poison any later arithmetic.
     *
     * @param node the trie node to inspect
     * @return the average count per child, or 1 when {@code node} has no children
     */
    private float getAverageChildCount(TrieNode node) {
        int childTotal = node.children.size();
        if (childTotal == 0) {
            return 1;
        }
        return (float) node.childCounts / childTotal;
    }

    public float perplexity(String phrase) {
        TrieNode current = this;
        TrieNode past = this;
        float logProb = 0;
        String[] words = phrase.split(" ");
        if (words.length == 1) {
            return (float) 100 / words.length;
        }
        for (String word : words) {
            past = current;
            current = current.children.get(word);
            if (current == null) {
                float alpha = (float) 100 / words.length;
                return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), words.length);
            }
            logProb += Math.log((float) current.count / past.getAverageChildCount());
        }
        float perplexity = (float) Math.pow(2, -logProb);
        //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity);
        return perplexity;
    }

    private float perplexity(String phrase, int wordCount) {
        TrieNode current = this;
        TrieNode past = this;
        float logProb = 0;
        String[] words = phrase.split(" ");
        if (words.length == 1) {
            return (float) 100 / wordCount;
        }
        String currentPhrase = "";
        for (String word : words) {
            past = current;
            current = current.children.get(word);
            if (current == null) {
                float alpha = (float) 100 / wordCount;
                return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), wordCount);
            if (currentPhrase.length() == 0) {
                currentPhrase = word;
            } else {
                currentPhrase += " " + word;
            }
            logProb += Math.log((float) current.count / past.getAverageChildCount());
            logProb += Math.log((probability(currentPhrase)));
        }
        float perplexity = (float) Math.pow(2, -logProb);
        //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity);
+103 −47

File changed.

Preview size limit exceeded, changes collapsed.