Merge branch 'master' into '13-gui' (7c83c077) · Commits · EC504 Spring 2024 Group Projects / Group7

INSTALL.txt

+41 −18

Original line number	Diff line number	Diff line
		The INSTALL.txt file should contain all information needed to install and run your code, from scratch, on a lab machine, including:
		- Pre-conditions
		- Hardware, peripherals, and operating system restrictions for running your code ( i.e., can this code only run on lab machines, or can it run elsewhere?)
		- All code should run on our lab computers, unless you have an exemption from the instructor.
		- If some of your features work more efficiently on specific hardware, please explain this here.

		- Supporting files
		- A list of non-standard libraries needed for your project to run, including:
		- Clear and simple instructions for how to freely (and legally!) acquire and install them from source code with minimal effort.
		- You may additionally link to a binary version of the libraries, if you wish.
		- Examples of how to use your project
		- Several clear examples that illustrate the main features of your project.
		- Descriptions of testing patterns, and instructions on how to exercise them:
		- unit tests
		- system tests

		- Execution
		- Clear and terse instructions on how an average student in the class can compile and run all your code, from scratch, on a lab machine.
		No newline at end of file
		INSTALL.txt for Language Correction Tool

		Pre-conditions
		- Hardware and OS Requirements:
		- The application runs on lab computers equipped with CentOS. It is cross-platform and compatible with any system that supports Java 17.

		- Java Requirement:
		- Java JDK 17 is required.

		Supporting Files
		- External Libraries:
		- Jsoup: For HTML parsing.

		Installation Instructions
		- Install Java JDK 17:
		- Follow the download and installation instructions from Oracle's website (https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html)
		- Set up JAVA_HOME and update your system's PATH.

		- Install Maven:
		- Download Maven from the Apache Maven website (https://maven.apache.org/download.cgi).
		- Follow the detailed installation instructions on the Maven website.
		- Ensure Maven's bin directory is in your system's PATH.

		- Setup Project:
		- Download or clone the project repository.
		- Navigate to the project directory (where pom.xml is located).
		- Run mvn clean install to resolve dependencies and build the project.

		Execution Instructions
		- Launch the application:
		- Open a terminal.
		- Change to the project directory (where pom.xml is located), since Maven must be run from there.
		- Run the application using Maven.

		Usage Examples
		- Starting the Application:
		- Execute the application as per the instructions above.
		- Select a module from the drop-down menu (Crawler, Checker, or Corrector).
		- Enter a URL or a local file path in the text field.
		- Click Run to execute the selected module.
		- Have fun!!!

src/main/java/edu/bu/LanguageCorrection/Corrector.java

+148 −90

Original line number	Diff line number	Diff line
		package edu.bu.LanguageCorrection;
		import java.io.BufferedReader;
		import java.io.FileReader;

		import java.io.IOException;
		import java.nio.file.Files;
		import java.nio.file.Paths;
		import java.util.HashMap;
		import java.util.zip.Inflater;
		import java.io.ByteArrayOutputStream;
		import java.io.FileInputStream;
		import java.util.List;
		import java.util.Map;
		import java.util.ArrayList;
		import java.util.PriorityQueue;
		import java.util.Collections;
		import java.util.HashMap;

		public class Corrector {
		private Map<String, Double> trigramProbabilities;
		private Map<String, Double> bigramProbabilities;
		private Map<String, Double> unigramProbabilities;

		private static final double BACKOFF_PENALTY = 0.1;
		public class Corrector {
		private TrieNode detector;

		public Corrector() {
		trigramProbabilities = new HashMap<>();
		bigramProbabilities = new HashMap<>();
		unigramProbabilities = new HashMap<>();
		loadBrown();
		detector = loadFile("metadata.ser");
		}

		private void loadBrown() {
		try (BufferedReader br = new BufferedReader(new FileReader("Checker/brown.txt"))) {
		String line;
		Map<String, Integer> bigramCounts = new HashMap<>();
		Map<String, Integer> trigramCounts = new HashMap<>();
		Map<String, Integer> unigramCounts = new HashMap<>();

		while ((line = br.readLine()) != null) {
		String[] words = line.split("\\s+");
		for (String word : words) {
		String lowerCaseWord = word.toLowerCase();
		unigramCounts.put(lowerCaseWord, unigramCounts.getOrDefault(lowerCaseWord, 0) + 1);
		/**
		 * Loads a trie from a compressed file on disk.
		 *
		 * <p>The whole file is read into memory, passed through {@code decompress},
		 * and the resulting bytes are handed to {@code TrieNode.deserialize} on a
		 * freshly created node.
		 *
		 * @param filePath path of the compressed metadata file to read
		 * @return the populated trie, or a new empty {@code TrieNode} if an
		 *         {@code IOException} occurs while reading
		 */
		private TrieNode loadFile(String filePath) {
		    TrieNode loaded = new TrieNode();
		    try (FileInputStream input = new FileInputStream(filePath)) {
		        byte[] inflated = decompress(input.readAllBytes());
		        loaded.deserialize(inflated);
		        System.out.println("Metadata loaded successfully.");
		    } catch (IOException e) {
		        // Best effort: report the problem and fall back to an empty trie.
		        System.err.println("Error reading metadata from file: " + e.getMessage());
		        return new TrieNode();
		    }
		    return loaded;
		}

		private static byte[] decompress(byte[] compressedData) {
		Inflater decompressor = new Inflater();
		decompressor.setInput(compressedData);

		if (words.length < 3) continue; // Skip lines with less than 3 words
		ByteArrayOutputStream bos = new ByteArrayOutputStream(compressedData.length);

		for (int i = 0; i < words.length - 1; i++) {
		String bigram = words[i].toLowerCase() + " " + words[i + 1].toLowerCase();
		bigramCounts.put(bigram, bigramCounts.getOrDefault(bigram, 0) + 1);
		byte[] buf = new byte[1024];
		try {
		while (!decompressor.finished()) {
		int count = decompressor.inflate(buf);
		bos.write(buf, 0, count);
		}
		decompressor.end();
		return bos.toByteArray();
		} catch (Exception e) {
		System.err.println("Error decompressing data: " + e.getMessage());
		return new byte[0];
		}
		}

		public String[] correct(String inputSentence) {
		// Divide sentence into words
		String[] words = inputSentence.split(" ");

		List<String> sentences = generateSentences(words);

		// Use a priority queue to store sentences along with their scores
		PriorityQueue<SentenceScorePair> pq = new PriorityQueue<>(
		Collections.reverseOrder());

		for (String sentence : sentences) {
		float score = detector.perplexity(sentence);
		// System.out.println(sentence + " \| Score: " + score);

		pq.offer(new SentenceScorePair(sentence, score));

		for (int i = 0; i < words.length - 2; i++) {
		String trigram = words[i].toLowerCase() + " " +
		words[i + 1].toLowerCase() + " " +
		words[i + 2].toLowerCase();
		trigramCounts.put(trigram, trigramCounts.getOrDefault(trigram, 0) + 1);
		// Ensure only the top 5 sentences are kept, remove the worst if more than 5
		if (pq.size() > 5) {
		pq.poll();
		}
		}

		int totalUnigrams = unigramCounts.values().stream().mapToInt(Integer::intValue).sum();
		int totalBigrams = bigramCounts.values().stream().mapToInt(Integer::intValue).sum();
		int totalTrigrams = trigramCounts.values().stream().mapToInt(Integer::intValue).sum();
		// Print the top sentences with their scores
		SentenceScorePair[] topPairs = new SentenceScorePair[pq.size()];
		int index = pq.size() - 1;
		while (!pq.isEmpty()) {
		topPairs[index] = pq.poll();
		index--;
		}

		// Print sentences in the right order
		// for (SentenceScorePair pair : topPairs) {
		// System.out.println(pair.sentence + " \| Score: " + pair.score);
		// }

		for (Map.Entry<String, Integer> entry : unigramCounts.entrySet()) {
		unigramProbabilities.put(entry.getKey(), (double) entry.getValue() / totalUnigrams);
		// Convert to array of just sentences
		String[] topSentences = new String[topPairs.length];
		for (int i = 0; i < topPairs.length; i++) {
		// Remove sentence that are not in the 0.1 percentile of the best sentence
		if (topPairs[i].score > topPairs[0].score * 1.5) {
		topSentences[i] = "";
		} else {
		topSentences[i] = topPairs[i].sentence;
		}
		}

		for (Map.Entry<String, Integer> entry : bigramCounts.entrySet()) {
		bigramProbabilities.put(entry.getKey(), (double) entry.getValue() / totalBigrams);
		return topSentences;
		}

		for (Map.Entry<String, Integer> entry : trigramCounts.entrySet()) {
		trigramProbabilities.put(entry.getKey(), (double) entry.getValue() / totalTrigrams);
		// Helper class to manage sentences and their scores
		class SentenceScorePair implements Comparable<SentenceScorePair> {
		String sentence;
		float score;

		public SentenceScorePair(String sentence, float score) {
		this.sentence = sentence;
		this.score = score;
		}

		} catch (IOException e) {
		e.printStackTrace();
		@Override
		public int compareTo(SentenceScorePair other) {
		return Float.compare(this.score, other.score);
		}
		}

		public String correct(String input) {
		StringBuilder correctedSentence = new StringBuilder();
		String[] words = input.split("\\s+");
		/**
		 * Builds the list of candidate sentences for the given words.
		 *
		 * <p>Starts the recursive {@code backtrack} search with an empty working
		 * sentence and with every word marked as unused, then returns whatever
		 * that search collected.
		 *
		 * @param words the words of the input sentence
		 * @return the candidate sentences gathered by {@code backtrack}
		 */
		public static List<String> generateSentences(String[] words) {
		    List<String> candidates = new ArrayList<>();
		    backtrack(candidates, words, new ArrayList<>(), new boolean[words.length]);
		    return candidates;
		}

		private static void backtrack(List<String> results, String[] words, List<String> current, boolean[] used) {
		if (current.size() >= Math.ceil(words.length * 3.0 / 4.0) && current.size() <= words.length) {
		results.add(String.join(" ", current));
		}

		for (int i = 0; i < words.length - 2; i++) {
		String trigram = words[i].toLowerCase() + " " +
		words[i + 1].toLowerCase() + " " +
		words[i + 2].toLowerCase();
		for (int i = 0; i < words.length; i++) {
		if (used[i])
		continue; // Skip used words

		if (!trigramProbabilities.containsKey(trigram)) {
		correctedSentence.append(suggestCorrection(words[i], words[i + 1], words[i + 2])).append(" ");
		} else {
		correctedSentence.append(words[i]).append(" ");
		correctedSentence.append(words[i + 1]).append(" ");
		correctedSentence.append(words[i + 2]).append(" ");
		used[i] = true;
		current.add(words[i]);
		backtrack(results, words, current, used);
		current.remove(current.size() - 1);
		used[i] = false;
		}
		}

		return correctedSentence.toString().trim();
		}
		private static void printSentencesInOrderOfChanges(String[] sentences, String originalSentence) {
		// Order the sentences by the number of changes needed
		Map<String, Integer> changesMap = new HashMap<>();

		private String suggestCorrection(String word1, String word2, String word3) {
		// Trigram, Bigram, and Unigram perplexities
		double trigramPerplexity = calculatePerplexity(trigramProbabilities, word1, word2, word3);
		double bigramPerplexity = calculatePerplexity(bigramProbabilities, word1, word2, "");
		double unigramPerplexity = calculatePerplexity(unigramProbabilities, word1, "", "");
		for (String sentence : sentences) {
		if (sentence == null \|\| sentence.isEmpty())
		continue; // Skip empty sentences (not in the 0.1 percentile of the best sentence)

		if (trigramPerplexity <= bigramPerplexity && trigramPerplexity <= unigramPerplexity) {
		return word1;
		} else if (bigramPerplexity <= unigramPerplexity) {
		return word2;
		int changes = 0;
		if (sentence.length() != originalSentence.length()) {
		changes = Math.abs(sentence.split(" ").length - originalSentence.split(" ").length) + 1;
		} else {
		return word3;
		String[] originalWords = originalSentence.split(" ");
		String[] correctedWords = sentence.split(" ");
		for (int i = 0; i < originalWords.length; i++) {
		if (!originalWords[i].equals(correctedWords[i])) {
		changes++;
		}
		}
		}

		private double calculatePerplexity(Map<String, Double> probabilities, String word1, String word2, String word3) {
		String trigram = word1.toLowerCase() + " " + word2.toLowerCase() + " " + word3;
		double probability = probabilities.getOrDefault(trigram, 0.0);

		// If probability is zero, BACKOFF
		if (probability == 0.0) {
		String bigram = word1.toLowerCase() + " " + word2.toLowerCase();
		probability = probabilities.getOrDefault(bigram, 0.0) * BACKOFF_PENALTY;
		if (probability == 0.0) {
		probability = unigramProbabilities.getOrDefault(word1.toLowerCase(), 0.0) * BACKOFF_PENALTY * BACKOFF_PENALTY;
		changesMap.put(sentence, changes);
		}

		List<Map.Entry<String, Integer>> sortedList = new ArrayList<>(changesMap.entrySet());
		sortedList.sort(Map.Entry.comparingByValue());

		for (Map.Entry<String, Integer> entry : sortedList) {
		System.out.println(" " + entry.getKey() + " \| Changes: " + entry.getValue());
		}
		// Perplexity
		return 1.0 / probability;
		}

		public static void main(String[] args) {
		if (args.length > 1 && "--file".equals(args[0])) { // check syntax
		if (args.length > 1 && "--file".equals(args[0])) {
		String path = args[1];
		try {
		String content = new String(Files.readAllBytes(Paths.get(path)));
		Corrector corrector = new Corrector(); // Run corrector
		String corrected = corrector.correct(content);
		System.out.println(corrected);
		String[] sentences = TextProcessor.extractSentences(content).toArray(new String[0]);
		for (String sentence : sentences) {
		sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", "");
		String[] corrected = corrector.correct(sentence);
		System.out.println(sentence + " \| Corrected Sentence Suggestions:");
		printSentencesInOrderOfChanges(corrected, sentence);
		}
		} catch (IOException e) {
		System.err.println("Error reading file: " + e.getMessage());
		}

src/main/java/edu/bu/LanguageCorrection/TrieNode.java

+21 −42

Original line number	Diff line number	Diff line
		@@ -14,6 +14,9 @@ public class TrieNode implements Serializable, Cloneable {
		TrieNode current = this;
		TrieNode past = this; // Store the previous node
		for (String word : phrase) {
		if (word.length() == 0) {
		continue;
		}
		past = current;
		current = current.children.computeIfAbsent(word, c -> new TrieNode());
		}
		@@ -22,66 +25,42 @@ public class TrieNode implements Serializable, Cloneable {
		}

		public float probability(String phrase) {
		String[] words = phrase.split(" ");
		if (words.length <= 1) { // If word does not exist in trie
		// System.out.println("Probability of phrase: " + 1 / this.childCounts);
		return (float) 0.1;
		}
		TrieNode current = this;
		TrieNode past = this;
		for (String word : phrase.split(" ")) {
		// System.out.println("Phrase: " + phrase);
		for (String word : words) {
		past = current;
		current = current.children.get(word);
		if (current == null) {
		float alpha = (float) 1;
		// System.out.println("Phrase not found in trie.");
		return 0;
		return alpha * probability(phrase.substring(phrase.indexOf(" ") + 1));
		}
		}
		// System.out.println("Probability of phrase: " + (double) current.count /
		// past.childCounts);
		// System.out.println("Probability of phrase: " + (float) current.count / past.childCounts);
		return (float) current.count / past.childCounts;
		}

		private float getAverageChildCount() {
		if (this.children.size() == 0) {
		return 1;
		}
		return (float) this.childCounts / this.children.size();
		private float getAverageChildCount(TrieNode node) {
		return (float) node.childCounts / node.children.size();
		}

		/**
		 * Scores a space-separated phrase by walking the trie one word at a time,
		 * starting from this node.
		 *
		 * <p>For every word found along the path, the log of
		 * {@code count / parent.getAverageChildCount()} is accumulated, and the
		 * result is {@code 2^(-sum)}. If a word is missing from the path, a fixed
		 * penalty of {@code 100 / wordCount} is added and scoring restarts on the
		 * phrase with its first word removed, via the two-argument overload.
		 *
		 * @param phrase words separated by single spaces
		 * @return the score for the phrase; a phrase that splits into a single
		 *         token returns 100
		 */
		public float perplexity(String phrase) {
		TrieNode current = this;
		TrieNode past = this;
		float logProb = 0;
		String[] words = phrase.split(" ");
		// A lone token cannot be scored against a context; return the fixed value 100 / 1.
		if (words.length == 1) {
		return (float) 100 / words.length;
		}
		for (String word : words) {
		past = current;
		current = current.children.get(word);
		if (current == null) {
		// Path not in the trie: charge a penalty and retry without the first word.
		float alpha = (float) 100 / words.length;
		// NOTE(review): replaceFirst treats words[0] as a regular expression, so a word
		// containing regex metacharacters would not be matched literally — confirm that
		// callers always pass sanitized text.
		return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), words.length);
		}
		logProb += Math.log((float) current.count / past.getAverageChildCount());
		}
		// NOTE(review): Math.log is the natural logarithm while the exponent base here is 2,
		// and the sum is not divided by the word count — verify this is the intended score.
		float perplexity = (float) Math.pow(2, -logProb);
		//System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity);
		return perplexity;
		}

		private float perplexity(String phrase, int wordCount) {
		TrieNode current = this;
		TrieNode past = this;
		float logProb = 0;
		String[] words = phrase.split(" ");
		if (words.length == 1) {
		return (float) 100 / wordCount;
		}
		String currentPhrase = "";
		for (String word : words) {
		past = current;
		current = current.children.get(word);
		if (current == null) {
		float alpha = (float) 100 / wordCount;
		return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), wordCount);
		if (currentPhrase.length() == 0) {
		currentPhrase = word;
		} else {
		currentPhrase += " " + word;
		}
		logProb += Math.log((float) current.count / past.getAverageChildCount());
		logProb += Math.log((probability(currentPhrase)));
		}
		float perplexity = (float) Math.pow(2, -logProb);
		//System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity);

src/main/java/edu/bu/LanguageCorrection/crawler.java

+103 −47

File changed.

Preview size limit exceeded, changes collapsed.