Commit 4c5e4f1c authored by Moises Bensadon's avatar Moises Bensadon
Browse files

Merge branch '36-set-all-text-to-lowercase-when-scraping' into 'master'

Resolve "Set all text to lowercase when scraping"

Closes #36

See merge request ec504/ec504_projects/group7!21
parents 51c77260 562ead7a
Loading
Loading
Loading
Loading
+43 −20
Original line number Diff line number Diff line
@@ -10,15 +10,22 @@ import java.util.List;
import java.util.Map;
import java.util.ArrayList;
import java.util.PriorityQueue;
import java.util.function.Consumer;
import java.util.Collections;
import java.util.HashMap;


public class Corrector {
    private TrieNode detector;
    private static Consumer<String> outputCallback;

    /**
     * Creates a corrector whose detector is whatever {@code loadFile(metadataPath)} returns.
     *
     * <p>Also resets the output callback to {@code System.out::println}. The callback field is
     * declared {@code static}, so this reset is visible to every {@code Corrector} instance,
     * including ones that previously installed a different callback.
     *
     * @param metadataPath path handed to {@code loadFile} to obtain the detector
     */
    public Corrector(String metadataPath) {
        detector = loadFile(metadataPath);
        // Default sink: print each message to standard output.
        outputCallback = System.out::println;
    }

    /**
     * Replaces the callback that receives the text this class emits.
     *
     * <p>The backing field is {@code static}, so although this is an instance method the new
     * callback is shared by all {@code Corrector} instances. No null check is performed here.
     *
     * @param callback consumer that will be handed each output string
     */
    public void setCallback(Consumer<String> callback) {
        outputCallback = callback;
    }

    public TrieNode getDetector() {
@@ -174,37 +181,52 @@ public class Corrector {
        }
    }

    private static void printSentencesInOrderOfChanges(String[] sentences, String originalSentence) {
    /**
     * Returns the length of the longest common subsequence of two word arrays.
     *
     * <p>Words are compared with {@link String#equals(Object)}; a subsequence keeps relative
     * order but does not need to be contiguous.
     *
     * @param originalWords words of the original sentence
     * @param shuffledWords words of the candidate sentence
     * @return number of words in the longest subsequence present in both arrays
     */
    private static int longestCommonSubsequence(String[] originalWords, String[] shuffledWords) {
        final int rows = originalWords.length;
        final int cols = shuffledWords.length;

        // table[r][c] holds the LCS length of the first r original words and the first c
        // shuffled words; row 0 and column 0 stay at their default value of 0.
        final int[][] table = new int[rows + 1][cols + 1];

        for (int r = 0; r < rows; r++) {
            final String word = originalWords[r];
            for (int c = 0; c < cols; c++) {
                table[r + 1][c + 1] = word.equals(shuffledWords[c])
                        ? table[r][c] + 1
                        : Math.max(table[r][c + 1], table[r + 1][c]);
            }
        }
        return table[rows][cols];
    }

    public void printSentencesInOrderOfChanges(String[] sentences, String originalSentence) {
        // Order the sentences by the number of changes needed
        Map<String, Integer> changesMap = new HashMap<>();

        String[] originalWords = originalSentence.split(" ");

        for (String sentence : sentences) {
            if (sentence == null || sentence.isEmpty())
                continue; // Skip empty sentences (not in the 0.5 percentile of the best sentence)
            if (sentence == null || sentence.isEmpty()) {
                continue;
            }

            String sentencePart = sentence.split(" | Score: ")[0]; // Remove score (if present
            String sentencePart = sentence.split(" \\| Score: ")[0]; // Remove score, if present
            String[] shuffledWords = sentencePart.split(" ");

            int changes = 0;
            if (sentencePart.length() != originalSentence.length()) {
                changes = Math.abs(sentencePart.split(" ").length - originalSentence.split(" ").length) + 1;
            } else {
                String[] originalWords = originalSentence.split(" ");
                String[] correctedWords = sentencePart.split(" ");
                for (int i = 0; i < originalWords.length; i++) {
                    if (!originalWords[i].equals(correctedWords[i])) {
                        changes++;
                    }
                }
            }
            // Calculate the longest common subsequence
            int lcsLength = longestCommonSubsequence(originalWords, shuffledWords);

            // Number of changes needed is the difference in length minus LCS length
            int changes = (originalWords.length - lcsLength) + (shuffledWords.length - lcsLength);

            changesMap.put(sentence, changes);
        }

        // Sort by number of changes in ascending order
        List<Map.Entry<String, Integer>> sortedList = new ArrayList<>(changesMap.entrySet());
        sortedList.sort(Map.Entry.comparingByValue());

        for (Map.Entry<String, Integer> entry : sortedList) {
            System.out.println(">> " + entry.getKey() + " | Changes: " + entry.getValue());
            outputCallback.accept(">> " + entry.getKey() + " | Changes: " + entry.getValue()+"\n");
        }
    }

@@ -215,16 +237,17 @@ public class Corrector {
            try {
                String content = new String(Files.readAllBytes(Paths.get(path)));
                Corrector corrector = new Corrector(metadataPath); // Run corrector
                corrector.setCallback(System.out::println);
                String[] sentences = TextProcessor.extractSentences(content).toArray(new String[0]);
                for (String sentence : sentences) {
                    sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", "");
                    String[] corrected = corrector.correct(sentence);
                    if (corrected.length == 0) {
                        System.out.println(sentence + " | No corrections needed.");
                        outputCallback.accept(sentence + " | No corrections needed.\n");
                        continue;
                    }
                    System.out.println(sentence + " | Corrected Sentence Suggestions:");
                    printSentencesInOrderOfChanges(corrected, sentence);
                    outputCallback.accept(sentence + " | Corrected Sentence Suggestions:");
                    corrector.printSentencesInOrderOfChanges(corrected, sentence);
                }
            } catch (IOException e) {
                System.err.println("Error reading file: " + e.getMessage());
+19 −44
Original line number Diff line number Diff line
@@ -9,9 +9,17 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;

import javax.swing.text.*;

import scala.collection.mutable.StringBuilder;

import java.awt.Color;
import java.awt.BorderLayout;
import java.util.zip.Deflater;
@@ -165,7 +173,7 @@ public class MainApp extends JFrame {
            if (isFile) {
                content = new String(Files.readAllBytes(Paths.get(text)));
            } else {
                content = text;
                content = text.toLowerCase();
            }

            // Assume content is already properly split into sentences here
@@ -259,72 +267,39 @@ public class MainApp extends JFrame {
        }
    }

    /**
     * Appends the suggested sentences to {@code result}, ordered by the number of word changes
     * relative to the original sentence (fewest first).
     *
     * <p>Each non-empty suggestion is written as
     * {@code ">> <suggestion> | Changes: <n>\n"}. An optional {@code " | Score: ..."} suffix on a
     * suggestion is ignored when counting changes but kept in the output line.
     *
     * <p>Change count: when the suggestion and the original have a different number of
     * space-separated words, the count is the difference in word count plus one; otherwise it
     * is the number of positions at which the words differ.
     *
     * @param sentences        candidate corrections; {@code null} or empty entries are skipped
     * @param originalSentence the sentence the candidates were generated from
     * @param result           buffer the formatted lines are appended to
     */
    private void printSentencesInOrderOfChanges(String[] sentences, String originalSentence, StringBuilder result) {
        // Order the sentences by the number of changes needed
        Map<String, Integer> changesMap = new HashMap<>();
        String[] originalWords = originalSentence.split(" ");

        for (String sentence : sentences) {
            if (sentence == null || sentence.isEmpty()) {
                continue; // Skip null or empty entries
            }

            // String.split takes a regex: the pipe must be escaped, otherwise " | Score: " means
            // "a space OR ' Score: '" and only the first word would be kept. The limit of 2
            // keeps element 0 present even when nothing follows the delimiter.
            String sentencePart = sentence.split(" \\| Score: ", 2)[0];
            String[] correctedWords = sentencePart.split(" ");

            int changes = 0;
            // Compare word counts (not character lengths) so the positional loop below can
            // never index past the end of the shorter array.
            if (correctedWords.length != originalWords.length) {
                changes = Math.abs(correctedWords.length - originalWords.length) + 1;
            } else {
                for (int i = 0; i < originalWords.length; i++) {
                    if (!originalWords[i].equals(correctedWords[i])) {
                        changes++;
                    }
                }
            }

            changesMap.put(sentence, changes);
        }

        // Sort by number of changes in ascending order
        List<Map.Entry<String, Integer>> sortedList = new ArrayList<>(changesMap.entrySet());
        sortedList.sort(Map.Entry.comparingByValue());

        for (Map.Entry<String, Integer> entry : sortedList) {
            result.append(">> " + entry.getKey() + " | Changes: " + entry.getValue() + "\n");
        }
    }

    private void runCorrector(String input, boolean isFile) {
        try {
            String content;
            if (isFile) 
                content= new String(Files.readAllBytes(Paths.get(input)));
            else
                content = input;
                content = input.toLowerCase();
            Corrector corrector = new Corrector(languageFile);
            Consumer<String> outputCallback = output -> {
                resultArea.append(output);
                resultArea.setCaretPosition(resultArea.getDocument().getLength());
            };
            corrector.setCallback(outputCallback);
            String[] sentences = TextProcessor.extractSentences(content).toArray(new String[0]);
            progressBar.setValue(0);
            progressBar.setStringPainted(true);
            progressBar.setString(null);
            progressBar.setMaximum(sentences.length*2);

            StringBuilder result = new StringBuilder();
            for (String sentence : sentences) {
                progressBar.setValue(progressBar.getValue() + 1);
                sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", "");
                String[] corrected = corrector.correct(sentence);
                progressBar.setValue(progressBar.getValue() + 2);
                if (corrected.length == 0) {
                    result.append(sentence + " | No corrections found for this sentence.\n\n");
                    outputCallback.accept(sentence + " | No corrections found for this sentence.\n\n");
                    continue;
                }
                result.append(sentence + " | Corrected Sentence Suggestions:\n");
                printSentencesInOrderOfChanges(corrected, sentence, result);
                result.append("\n");
                outputCallback.accept(sentence + " | Corrected Sentence Suggestions:\n");
                corrector.printSentencesInOrderOfChanges(corrected, sentence);
            }

            resultArea.setText(result.toString());
            // resultArea.setText(result.toString());
            // ISSUE #30 - Feedback for corrector
            // Add a pop up to input the best correction for each sentence
            TrieNode node = corrector.getDetector();
+1 −1
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ public class TrieNode implements Serializable, Cloneable {
                continue;
            }
            past = current;
            current = current.children.computeIfAbsent(word, c -> new TrieNode());
            current = current.children.computeIfAbsent(word.toLowerCase(), c -> new TrieNode());
        }
        current.count += 1;
        past.childCounts += 1;
+7 −18
Original line number Diff line number Diff line
@@ -51,10 +51,12 @@ administrators and the awarding of fees and compensation.

This is a strange choice of word. Kenya was a British colony, and the
British government was responsible for the welfare of the people of
Kenya. The British government was not a "regime" in Kenya. The
Mau Mau were a group of rebels who were fighting against the British
government, and they were responsible for the deaths of many innocent
people. The British government put down the rebellion, and they put
Kenya. Hello how are you doing today. Hello there. Hey there. 
Hello how are you doing today. Hello there. Hey there.
Hello how are you doing today. Hello there. Hey there.
Hello how are you doing today. Hello there. Hey there.
Hello how are you doing today. Hello there. Hey there.
The British government put down the rebellion, and they put
the leaders of the rebellion on trial. Kenya, officially the Republic of Kenya,
This is an odd choice of word order. Sorry for the inconvenience. 
Please wait until we have finished the translation. This is just a random list
@@ -48895,20 +48897,7 @@ and tremendously wise in his counsel. In conclusion, he wished me well-
and as kindly and humbly as this humane gentleman could express 
himself, he asked to be remembered to my wife and children.   In 
my short period here I believe that at no time has he been otherwise 
than the most popular man on both sides of the aisle. He is most effective 
in the ordinary business of the House, and in the legislative 
accomplishments of this session, he easily rose to great occasion- even 
at the height of unpleasantness and exciting legislative struggle- 
and as the Nation witnessed these contests, he rose, even as admitted 
by those who differed with him, to the proportions of a hero and a 
noble partisan.   I am highly privileged today to commemorate the 
brilliant career of this parliamentary giant. He will ever be my example 
as a true statesman; one who is thoroughly human, who affects 
no dignity, and who is endowed with real ability, genuine worth, and 
sterling honesty- all dedicated to secure the best interests of the 
country he has loved and served so long. May the Divine Speaker in 
Heaven bless this country with SAM RAYBURN'S continued service 
here for years to come. 
than the most popular man on both sides of the aisle. 

#REMARKS OF HON& WAYNE L& HAYS OF OHIO#