Commit 3c7a69e1 authored by Manuel  Segimon's avatar Manuel Segimon
Browse files

Merge branch '13-gui' into 'master'

GUI

Closes #13

See merge request ec504/ec504_projects/group7!8
parents a18c93ca 7c83c077
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
This is unambiguous. This is word odd so choice. Sorry for the inconvenience.
Kenya, officially the Republic of Kenya
 No newline at end of file
Kenya, officially the Republic of Kenya.
 No newline at end of file
+6 −0
Original line number Diff line number Diff line
@@ -20,6 +20,12 @@
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>
    
      <dependency>
        <groupId>de.sciss</groupId>
        <artifactId>dotterweide-ui_2.12</artifactId>
        <version>0.4.3</version>
      </dependency>
    </dependencies>

</project>
 No newline at end of file
+2 −2
Original line number Diff line number Diff line
package edu.bu.LanguageCorrection;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
@@ -9,7 +10,6 @@ import java.util.zip.Inflater;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;


public class Checker {
    public void analyze(String text) {
        List<String> sentences = TextProcessor.extractSentences(text);
+240 −0
Original line number Diff line number Diff line
package edu.bu.LanguageCorrection;

import javax.sql.rowset.spi.SyncFactory;
import javax.swing.*;
import javax.swing.text.Highlighter;
import javax.swing.text.Highlighter.Highlight;

import dotterweide.editor.painter.HighlightPainter;

import java.awt.*;
// import java.awt.event.ActionEvent;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import javax.swing.*;
import javax.swing.text.*;
import java.awt.Color;
import java.awt.BorderLayout;
import java.awt.event.ActionEvent;

/**
 * Swing front end for the language-correction tools.
 *
 * <p>The window shows a module selector, a single-line input field and a Run
 * button across the top, and a read-only text area underneath that displays
 * the output of whichever module was run.
 */
public class MainApp extends JFrame {
    /** Page limit passed to {@code crawler.crawl} for every crawl started from the GUI. */
    private static final int MAX_CRAWL_PAGES = 5;

    private final JTextField urlField;
    private final JTextArea resultArea;
    private final JButton runButton;
    private final JComboBox<String> moduleSelector;

    /**
     * Builds the window. If {@code metadata.ser} does not exist in the working
     * directory the user is first asked which language corpus to build from;
     * cancelling that dialog skips the build.
     */
    public MainApp() {
        super("Language Correction Tool");
        setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        setSize(800, 600);
        setLocationRelativeTo(null);
        setLayout(new BorderLayout());

        // Check if metadata file exists
        if (!Files.exists(Paths.get("metadata.ser"))) {
            String[] languages = {"en", "es", "pt", "it"};
            String selectedLanguage = (String) JOptionPane.showInputDialog(this, "Metadata file not found. Please choose a language to build off of. \n(If you want to build from scratch just click Cancel)", "Language Selection", JOptionPane.PLAIN_MESSAGE, null, languages, languages[0]);
            if (selectedLanguage != null) {
                crawler webCrawler = new crawler();
                webCrawler.build(selectedLanguage);
            }
        }

        // Module selector
        String[] modules = {"Select Module", "Web Crawler", "Reddit Crawler", "File Checker", "File Corrector", "Text Checker", "Text Corrector"};
        moduleSelector = new JComboBox<>(modules);

        // User input entry field and run button
        urlField = new JTextField();
        runButton = new JButton("Run");
        runButton.addActionListener(e -> runSelectedModule());

        JPanel northPanel = new JPanel(new BorderLayout());
        northPanel.add(moduleSelector, BorderLayout.WEST);
        northPanel.add(urlField, BorderLayout.CENTER);
        northPanel.add(runButton, BorderLayout.EAST);

        // Result area
        resultArea = new JTextArea();
        resultArea.setEditable(false);
        JScrollPane scrollPane = new JScrollPane(resultArea);

        add(northPanel, BorderLayout.NORTH);
        add(scrollPane, BorderLayout.CENTER);
    }

    /**
     * Runs the module currently chosen in the selector against the trimmed
     * contents of the input field. The placeholder entry ("Select Module")
     * only produces a prompt dialog.
     */
    private void runSelectedModule() {
        String selectedModule = (String) moduleSelector.getSelectedItem();
        String input = urlField.getText().trim();

        switch (selectedModule) {
            case "Web Crawler":
                runCrawler(input);
                break;
            case "Reddit Crawler":
                runCrawler("https://www.reddit.com/r/" + input);
                break;
            case "File Checker":
                runChecker(input, true);
                break;
            case "File Corrector":
                runCorrector(input, true);
                break;
            case "Text Checker":
                runChecker(input, false);
                break;
            case "Text Corrector":
                runCorrector(input, false);
                break;
            default:
                JOptionPane.showMessageDialog(this, "Select a valid module");
        }
    }

    /**
     * Starts a crawl on a background thread and mirrors the crawler's
     * progress messages into the result area.
     *
     * @param input a single URL (anything starting with {@code http}) or the
     *              path of a text file containing one URL per line
     */
    private void runCrawler(String input) {
        StringBuilder outputBuilder = new StringBuilder();
        crawler webCrawler = new crawler(output -> {
            outputBuilder.append(output);
            // Take the snapshot on the thread that appends. Calling toString()
            // inside invokeLater would read the builder on the EDT while the
            // crawler thread may still be appending to it.
            String snapshot = outputBuilder.toString();
            SwingUtilities.invokeLater(() -> resultArea.setText(snapshot));
        });
        if (input.startsWith("http")) {
            // Input is a single URL
            webCrawler.add_to_queue(input);
        } else {
            // Input is file path
            try {
                for (String line : Files.readAllLines(Paths.get(input))) {
                    String url = line.trim();
                    // Blank lines (e.g. a trailing newline) are not URLs.
                    if (!url.isEmpty()) {
                        webCrawler.add_to_queue(url);
                    }
                }
            } catch (Exception e) {
                resultArea.setText("Error reading file: " + e.getMessage());
                return;
            }
        }
        // Crawl off the event thread so the window stays responsive.
        Thread crawlerThread = new Thread(() -> webCrawler.crawl(MAX_CRAWL_PAGES));
        crawlerThread.start();
    }

    /**
     * Analyzes the given text sentence by sentence, lists the lowest-scoring
     * phrase of each sentence in the result area and highlights it.
     *
     * @param text   the text itself, or a file path when {@code isFile} is true
     * @param isFile whether {@code text} names a file to read
     */
    private void runChecker(String text, boolean isFile) {
        try {
            String content = isFile ? new String(Files.readAllBytes(Paths.get(text))) : text;
            List<String> sentences = TextProcessor.extractSentences(content);

            Checker checker = new Checker();
            StringBuilder report = new StringBuilder();

            // Indexed by sentence position; an entry stays null when no phrase
            // was parsed for that sentence.
            String[] worstPhrases = new String[sentences.size()];
            // Offset in the report at which each sentence's own text begins.
            int[] sentenceOffsets = new int[sentences.size()];

            for (int i = 0; i < sentences.size(); i++) {
                String sentence = sentences.get(i);
                String output = captureAnalysis(checker, sentence);

                double lowestScore = Double.MAX_VALUE;
                String worstPhrase = null;

                for (String line : output.split("\n")) {
                    if (!line.startsWith("\"phrases\":")) {
                        continue;
                    }
                    String body = line.replace("\"phrases\":", "").replace("{", "").replace("}", "").trim();
                    for (String entry : body.split(",")) {
                        // Split on the last colon so a colon inside the phrase
                        // does not break parsing; skip entries with no score.
                        int colon = entry.lastIndexOf(':');
                        if (colon < 0) {
                            continue;
                        }
                        String scoreText = entry.substring(colon + 1).trim();
                        double phraseScore;
                        if (scoreText.equals("null")) {
                            phraseScore = 0;
                        } else {
                            try {
                                phraseScore = Double.parseDouble(scoreText);
                            } catch (NumberFormatException ignored) {
                                // One malformed entry should not discard the whole report.
                                continue;
                            }
                        }
                        if (phraseScore < lowestScore) {
                            lowestScore = phraseScore;
                            worstPhrase = entry.substring(0, colon).trim();
                        }
                    }
                }

                if (worstPhrase != null) {
                    // Append to the result with annotations
                    report.append("\nSentence: ");
                    sentenceOffsets[i] = report.length();
                    report.append(sentence)
                            .append("\n>> Worst Phrase: ").append(worstPhrase)
                            .append(" (Score: ").append(lowestScore).append(")\n");
                    worstPhrases[i] = worstPhrase;
                }
            }

            resultArea.setText(report.toString());
            highlightWorstPhrases(worstPhrases, sentenceOffsets);
        } catch (Exception e) {
            resultArea.setText("Error: " + e.getMessage());
        }
    }

    /**
     * Runs {@code checker.analyze(sentence)} with {@code System.out}
     * temporarily redirected and returns whatever was printed.
     * {@code System.out} is restored even if the analysis throws.
     */
    private static String captureAnalysis(Checker checker, String sentence) {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        PrintStream capture = new PrintStream(buffer);
        PrintStream originalOut = System.out;
        System.setOut(capture);
        try {
            checker.analyze(sentence);
        } finally {
            capture.flush();
            System.setOut(originalOut);
        }
        return buffer.toString();
    }

    /**
     * Highlights each recorded worst phrase in the result area.
     *
     * @param worstPhrases    phrase per sentence, {@code null} where none was found
     * @param sentenceOffsets offset in the displayed text where each sentence starts;
     *                        the search begins there so a phrase that also occurs in
     *                        an earlier sentence is not highlighted in the wrong place
     */
    private void highlightWorstPhrases(String[] worstPhrases, int[] sentenceOffsets) {
        Highlighter highlighter = resultArea.getHighlighter();
        Highlighter.HighlightPainter painter = new DefaultHighlighter.DefaultHighlightPainter(Color.YELLOW);
        String shown = resultArea.getText();

        for (int i = 0; i < worstPhrases.length; i++) {
            if (worstPhrases[i] == null) {
                continue;
            }
            // The analyzer quotes phrases; the sentence text does not.
            String needle = worstPhrases[i].replace("\"", "");
            if (needle.isEmpty()) {
                continue;
            }
            int start = shown.indexOf(needle, sentenceOffsets[i]);
            if (start == -1) {
                System.out.println("Text not found");
                continue;
            }
            try {
                highlighter.addHighlight(start, start + needle.length(), painter);
            } catch (BadLocationException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Runs the corrector and shows the corrected text in the result area.
     *
     * @param input  the text itself, or a file path when {@code isFile} is true
     * @param isFile whether {@code input} names a file to read
     */
    private void runCorrector(String input, boolean isFile) {
        try {
            String content;
            if (isFile) {
                content = new String(Files.readAllBytes(Paths.get(input)));
            } else {
                content = input;
            }
            Corrector corrector = new Corrector();
            String corrected = corrector.correct(content);
            resultArea.setText("Corrected Text:\n" + corrected);
        } catch (Exception e) {
            resultArea.setText("Error: " + e.getMessage());
        }
    }

    /** Creates and shows the window on the Swing event dispatch thread. */
    public static void main(String[] args) {
        SwingUtilities.invokeLater(() -> {
            MainApp mainApp = new MainApp();
            mainApp.setVisible(true);
        });
    }
}
+109 −53
Original line number Diff line number Diff line
@@ -16,10 +16,13 @@ import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
import java.util.function.Consumer;

import java.lang.Math;


public class crawler {

    public static void main(String[] args) throws IOException {
@@ -34,15 +37,17 @@ public class crawler {
            } else if (args[i].equals("--debug")) {
                web_crawler.debug = true;
            } else if (args[i].equals("--build")) {
                web_crawler.build_off_corpus = true;
                System.out.println("Building off-corpus...");
                web_crawler.build(args[i + 1]);
            } else if (args[i].equals("--social")) {
                web_crawler.is_username = true;
                file_url = args[i + 1]; // Reuse variable to hold username
            }
        }
        if (!web_crawler.build_off_corpus) {
        if (!file_url.isEmpty()) {
            if (web_crawler.is_username) {
                String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user page
                String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user
                                                                                   // page
                // System.out.println(file_url);
                web_crawler.add_to_queue(user_url);
            } else {
@@ -66,6 +71,7 @@ public class crawler {
        // Print visited URLs
        web_crawler.get_visited();
    }

    // public void get_queue(){
    // for (String url : url_queue) {
    // System.out.println(url);
@@ -81,6 +87,7 @@ public class crawler {
    private boolean build_off_corpus = false;
    private boolean is_username = false; // flag for provided (reddit) username
    private static final int MAXNGRAM = 3;
    private Consumer<String> outputCallback;

    public crawler() {
        url_queue = new LinkedList<>();
@@ -88,20 +95,26 @@ public class crawler {

        // Load trie from file
        wordUsage = loadFile(filePath);
        this.outputCallback = System.out::println;

        // Estimate page count based on compressed file size
    }

    /**
     * Creates a crawler that sends its progress messages to the given callback.
     *
     * @param outputCallback receiver for progress messages; when {@code null},
     *                       messages go to {@code System.out::println}, the same
     *                       sink the no-argument constructor installs
     */
    public crawler(Consumer<String> outputCallback) {
        // A null callback would otherwise only fail later, on the first
        // outputCallback.accept(...) call in the middle of a crawl.
        if (outputCallback != null) {
            this.outputCallback = outputCallback;
        } else {
            this.outputCallback = System.out::println;
        }
        url_queue = new LinkedList<>();
        visited_urls = new HashSet<>();

        // Load trie from file
        wordUsage = loadFile(filePath);
    }

    /**
     * Appends a URL to the queue of pages waiting to be crawled.
     *
     * <p>{@code null} and blank values are ignored: a {@code null} element would
     * make {@code url_queue.poll()} indistinguishable from an empty queue, and a
     * blank string is not a fetchable address.
     *
     * @param url the address to enqueue
     */
    public void add_to_queue(String url) {
        if (url == null || url.trim().isEmpty()) {
            return;
        }
        url_queue.add(url);
    }

    public void crawl(int maxPages) {
        int pageCount = 0;
        if (build_off_corpus) {
            System.out.println("Building off corpus...");
            processPage(get_file_text("brown.txt"), false); // TODO: extend to multiple files i.e. one per language
        }
        while (!url_queue.isEmpty() && pageCount < maxPages) {
            String cur_site = url_queue.poll();

@@ -114,6 +127,7 @@ public class crawler {
            }
            try {
                System.out.println("Processing: " + cur_site);
                outputCallback.accept("\nProcessing: " + cur_site);
                Document web_data = get_web_data(cur_site);
                if (web_data != null) {
                    processPage(web_data, isPost);
@@ -125,12 +139,35 @@ public class crawler {
            }
        }
        System.out.println("Total pages visited: " + pageCount);
        outputCallback.accept("\nTotal pages visited: " + pageCount);
    }

    /**
     * Processes the corpus file that corresponds to the given language code.
     *
     * <p>Recognized codes are {@code en}, {@code es}, {@code it} and {@code pt}.
     * Any other code is reported on {@code System.err} and nothing is processed.
     *
     * @param language language code selecting the corpus file
     */
    public void build(String language) {
        final String corpusFile;
        switch (language) {
            case "en":
                corpusFile = "brown.txt";
                break;
            case "es":
                corpusFile = "es.txt";
                break;
            case "it":
                corpusFile = "it.txt";
                break;
            case "pt":
                corpusFile = "pt.txt";
                break;
            default:
                System.err.println("Unsupported language: " + language);
                return;
        }

        // Only set once a supported corpus has been chosen.
        build_off_corpus = true;

        Document corpusText = get_file_text(corpusFile);
        processPage(corpusText, false);
    }

    private Document get_web_data(String url) throws IOException {
        //use execute() in order to receive a response object -> allows status code checking
        // use execute() in order to receive a response object -> allows status code
        // checking
        Connection.Response req_response = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
                .userAgent(
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
                .referrer("http://www.google.com")
                .execute();

@@ -143,7 +180,8 @@ public class crawler {
    }

    private Document get_file_text(String filename) {
        // takes filename as input, reads it from src/main/resources and returns the text as a Document object
        // takes filename as input, reads it from src/main/resources and returns the
        // text as a Document object
        try {
            Document doc = new Document("");
            doc.title(filename);
@@ -154,6 +192,7 @@ public class crawler {
                    doc.append(line);
                }
                System.out.println("File read successfully.");
                outputCallback.accept("\nFile read successfully.");
            }
            return doc;
        } catch (IOException e) {
@@ -198,7 +237,8 @@ public class crawler {
        // Ignore for now - optimizes body + comment content retrieval
        // if(isPost) {
        // Get proper tags + queries
//            Element block = web_data.selectFirst("shreddit-post.block.xs:mt-xs.xs:-mx-xs.xs:px-xs.xs:rounded-[16px].pt-xs.nd:pt-xs.bg-[color:var(--shreddit-content-background)].box-border.mb-xs.nd:visible.nd:pb-2xl");
        // Element block =
        // web_data.selectFirst("shreddit-post.block.xs:mt-xs.xs:-mx-xs.xs:px-xs.xs:rounded-[16px].pt-xs.nd:pt-xs.bg-[color:var(--shreddit-content-background)].box-border.mb-xs.nd:visible.nd:pb-2xl");
        // String body = block.select("div.text-neutral-content.text-body").text();
        // System.out.println(body);
        // }
@@ -208,6 +248,7 @@ public class crawler {
            List<String> chunks = splitTextIntoChunks(web_data.text());
            if (chunks.isEmpty()) {
                System.out.println("No text found on page.");
                outputCallback.accept("\nNo text found on page.");
                return;
            }
            // System.out.println(chunks.size());
@@ -215,14 +256,21 @@ public class crawler {
            for (String chunk : chunks) {
                // chunkCount++;
                // System.out.println(chunk);
                // System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size());
                // System.out.println("Current compressed size: "+ compressedData.length+".
                // Processing chunk " + chunkCount + " of " + chunks.size());
                previousUncompressedData = uncompressedData.clone();
                extractWordUsage(chunk, wordUsage);
                uncompressedData = wordUsage.serialize();
                compressedData = compress(uncompressedData);
                if ((compressedData.length - compression_size > 1024)) {
                    System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "+ (compressedData.length - compression_size) + " bytes.");
                    System.out.println("Previous compressed data size: " + compression_size
                            + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "
                            + (compressedData.length - compression_size) + " bytes.");
                    outputCallback.accept("\nPrevious compressed data size: " + compression_size
                            + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "
                            + (compressedData.length - compression_size) + " bytes.");
                    System.out.println("Size limit exceeded. Reverting to previous chunk.");
                    outputCallback.accept("\nSize limit exceeded. Reverting to previous chunk.");
                    sizeLimitExceeded = true;
                    uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
                    compressedData = compress(uncompressedData); // Recompress the reverted state
@@ -247,21 +295,27 @@ public class crawler {
        writeToFile(compressedData, filePath);
        if (!sizeLimitExceeded) {
            System.out.println("Compressed tree exported successfully to: " + filePath);
            outputCallback.accept("\nCompressed tree exported successfully to: " + filePath);
        } else {
            System.out.println("Compressed data truncated due to size limit.");
        }
        endTime = System.nanoTime();

        //System.out.println((endTime - startTime)/1000000000.0); // Total time taken to complete processing
        // System.out.println((endTime - startTime)/1000000000.0); // Total time taken
        // to complete processing

        // Output sizes of both compressed and uncompressed data for reference
        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes\n");
        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
        outputCallback.accept("\nUncompressed metadata size: " + uncompressedData.length + " bytes");
        // Output rate of processing
        double processingRate = web_data.text().length() / ((endTime - startTime) / 1000000000.0);
        System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
        outputCallback.accept("\nRate of processing: " + Math.round(processingRate) + " bytes/second");
        // Output # of links found in page
        System.out.println("# of additional links found: " + foundLinks + "\n");
        outputCallback.accept("\n# of additional links found: " + foundLinks + "\n");

        compression_size = compressedData.length;
    }
@@ -336,19 +390,20 @@ public class crawler {
            byte[] compressedData = fis.readAllBytes();
            byte[] decompressedData = decompress(compressedData);
            compression_size = compressedData.length;
            // System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
            // System.out.println("Decompressed metadata size: " + decompressedData.length + " bytes");
            // System.out.println("Compressed metadata size: " + compressedData.length + "
            // bytes");
            // System.out.println("Decompressed metadata size: " + decompressedData.length +
            // " bytes");
            trie.deserialize(decompressedData);
            System.out.println("Metadata loaded successfully.");
            return trie;
        } catch (IOException e) {
            System.err.println("Error reading metadata from file: " + e.getMessage());
            // System.err.println("Error reading metadata from file: " + e.getMessage());
            System.err.println("Creating new trie...");
            return new TrieNode();
        }
    }


    private static void writeToFile(byte[] compressedData, String filePath) {
        try (FileOutputStream fos = new FileOutputStream(filePath)) {
            fos.write(compressedData);
@@ -356,6 +411,7 @@ public class crawler {
            System.err.println("Error wr iting metadata to file: " + e.getMessage());
        }
    }

    // Output visited URLs as specified
    public void get_visited() {
        System.out.println("All of the visited websites:");