Commit 7c33b9bb authored by Moises Bensadon's avatar Moises Bensadon
Browse files

Merge branch '33-render-loading-bar' into '31-build-process-render-results'

Resolve "Render Loading Bar"

See merge request ec504/ec504_projects/group7!20
parents 605ed3aa 9504aa2f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
rm metadata.ser
rm SmallEnglish.ser
rm uncompressed-metadata.json
mvn clean install
mvn exec:java
 No newline at end of file
+42 −24
Original line number Diff line number Diff line
package edu.bu.LanguageCorrection;

import javax.sql.rowset.spi.SyncFactory;
import javax.swing.*;
import javax.swing.text.Highlighter;
import javax.swing.text.Highlighter.Highlight;

import dotterweide.editor.painter.HighlightPainter;

import java.awt.*;
// import java.awt.event.ActionEvent;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.StringBufferInputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.ArrayList;
import java.util.HashMap;
import javax.swing.*;
import javax.swing.text.*;
import java.awt.Color;
import java.awt.BorderLayout;
import java.awt.event.ActionEvent;
import java.util.zip.Deflater;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -32,12 +23,12 @@ import java.io.IOException;
public class MainApp extends JFrame {
    private final JTextField urlField;
    private JTextArea resultArea;
    private Highlighter.HighlightPainter myHighlightPainter;
    private final JButton runButton;
    private final JButton changeLanguageButton;
    private final JComboBox<String> moduleSelector;
    private JProgressBar progressBar = new JProgressBar(0, 100);

    private String languageFile = "English.ser";
    private String languageFile = "SmallEnglish.ser";

    public MainApp() {
        super("Language Correction Tool");
@@ -48,12 +39,10 @@ public class MainApp extends JFrame {

        // Check if metadata file exists
        if (!Files.exists(Paths.get(languageFile))) {
            String[] languages = {"English", "German", "Portuguese", "Italian"};
            String[] languages = {"SmallEnglish","English", "German", "Portuguese", "Italian"};
            String selectedLanguage = (String) JOptionPane.showInputDialog(this, "Metadata file not found. Please choose a language to build off of. \n(If you want to build from scratch just click Cancel)", "Language Selection", JOptionPane.PLAIN_MESSAGE, null, languages, languages[0]);
            if (selectedLanguage != null) {
                languageFile = selectedLanguage + ".ser";
                crawler webCrawler = new crawler(languageFile);
                webCrawler.build(selectedLanguage);
                runBuilder(selectedLanguage);
            }
        }

@@ -70,10 +59,10 @@ public class MainApp extends JFrame {

            switch (selectedModule) {
                case "Web Crawler":
                    runCrawler(input);
                    runCrawler(input,10);
                    break;
                case "Reddit Crawler":
                    runCrawler("https://www.reddit.com/r/"+input);
                    runCrawler("https://www.reddit.com/r/"+input, 1);
                    break;
                case "File Checker":
                    runChecker(input,true);
@@ -91,6 +80,7 @@ public class MainApp extends JFrame {
                    JOptionPane.showMessageDialog(this, "Select a valid module");
            }
        });
        urlField.addActionListener(e -> runButton.doClick()); // On enter keydown it will click run
        changeLanguageButton = new JButton("Change Language");
        changeLanguageButton.addActionListener(e -> {
            String[] languages = {"English", "German", "Portuguese", "Italian"};
@@ -101,8 +91,7 @@ public class MainApp extends JFrame {
                // Check if metadata file exists
                String extraText = "";
                if (!Files.exists(Paths.get(languageFile))) {
                    crawler webCrawler = new crawler(languageFile);
                    webCrawler.build(selectedLanguage);
                    runBuilder(selectedLanguage);
                    extraText = "New metadata file created.\n";
                }

@@ -115,6 +104,7 @@ public class MainApp extends JFrame {
        northPanel.add(urlField, BorderLayout.CENTER);
        northPanel.add(runButton, BorderLayout.EAST);
        northPanel.add(changeLanguageButton, BorderLayout.NORTH);
        northPanel.add(progressBar, BorderLayout.SOUTH);

        // Result area
        resultArea = new JTextArea();
@@ -125,12 +115,29 @@ public class MainApp extends JFrame {
        add(scrollPane, BorderLayout.CENTER);
    }

    private void runCrawler(String input) {
    /**
     * Builds a fresh metadata (.ser) file for the chosen language on a
     * background thread so the Swing UI stays responsive while the
     * crawler's corpus build runs.
     *
     * @param selectedLanguage corpus name (e.g. "English"); also determines
     *                         the {@code languageFile} used from now on
     */
    private void runBuilder(String selectedLanguage) {
        languageFile = selectedLanguage + ".ser";
        final StringBuilder accumulated = new StringBuilder();
        // The crawler streams log text back through this callback; all
        // JTextArea updates are marshalled onto the EDT via invokeLater.
        crawler builder = new crawler(languageFile, chunk -> {
            accumulated.append(chunk);
            SwingUtilities.invokeLater(() -> resultArea.setText(accumulated.toString()));
        }, progressBar);
        // Prime the progress bar before the worker starts reporting progress.
        progressBar.setString("Building "+selectedLanguage+" metadata file...");
        progressBar.setValue(0);
        // Fire-and-forget worker; the thread has not run before start(), so
        // ordering relative to the progress-bar setup above is preserved.
        new Thread(() -> builder.build(selectedLanguage)).start();
    }
    
    private void runCrawler(String input, Integer limit) {
        progressBar.setValue(0);
        progressBar.setString(null);
        StringBuilder outputBuilder = new StringBuilder();
        crawler webCrawler = new crawler(languageFile, output -> {
            outputBuilder.append(output);
            SwingUtilities.invokeLater(() -> resultArea.setText(outputBuilder.toString()));
        }, progressBar);
        if (input.startsWith("http")) {
            // Input is a single URL
            webCrawler.add_to_queue(input);
@@ -140,13 +147,13 @@ public class MainApp extends JFrame {
                List<String> lines = Files.readAllLines(Paths.get(input));
                lines.forEach(webCrawler::add_to_queue);
            } catch (Exception e) {
                resultArea.setText("Error reading file: " + e.getMessage());
                resultArea.setText("Error reading file/link. Please make sure to include http or .txt for link or file respectively: " + e.getMessage());
                return;
            }
        }
        // Create a new thread for running the crawler
        Thread crawlerThread = new Thread(() -> {
            webCrawler.crawl(5); // Adjust the limit (DEFAULT SET TO 1)
            webCrawler.crawl(limit); // Adjust the limit (DEFAULT SET TO 1)
        });
        // Start the crawler thread
        crawlerThread.start();
@@ -297,11 +304,17 @@ public class MainApp extends JFrame {
                content = input;
            Corrector corrector = new Corrector(languageFile);
            String[] sentences = TextProcessor.extractSentences(content).toArray(new String[0]);
            progressBar.setValue(0);
            progressBar.setStringPainted(true);
            progressBar.setString(null);
            progressBar.setMaximum(sentences.length*2);

            StringBuilder result = new StringBuilder();
            for (String sentence : sentences) {
                progressBar.setValue(progressBar.getValue() + 1);
                sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", "");
                String[] corrected = corrector.correct(sentence);
                progressBar.setValue(progressBar.getValue() + 2);
                if (corrected.length == 0) {
                    result.append(sentence + " | No corrections found for this sentence.\n\n");
                    continue;
@@ -312,7 +325,7 @@ public class MainApp extends JFrame {
            }

            resultArea.setText(result.toString());

            // ISSUE #30 - Feedback for corrector
            // Add a pop up to input the best correction for each sentence
            TrieNode node = corrector.getDetector();
            boolean changeMade = false;
@@ -344,11 +357,16 @@ public class MainApp extends JFrame {
                }
                String selected = (String) JOptionPane.showInputDialog(this, "Select the best correction for the following sentence:\n  " + sentence, "Correction Selection", JOptionPane.PLAIN_MESSAGE, null, options, options[0]);
                if (selected != null) {
                    progressBar.setValue(0);
                    progressBar.setStringPainted(true);
                    progressBar.setString(null);
                    // Insert the selected correction into the trie for future reference
                    List<String> phrases = TextProcessor.extractPhrases(selected, 2, 3);
                    List<String> phrases = TextProcessor.extractPhrases(selected, 1, 3);
                    progressBar.setMaximum(phrases.size());
                    for (String phrase : phrases) {
                        String[] words = phrase.split(" ");
                        node.insert(words);
                        progressBar.setValue(progressBar.getValue() + 1);
                    }
                    changeMade = true;
                } else {
+56 −20
Original line number Diff line number Diff line
@@ -10,6 +10,9 @@ import org.jsoup.select.Elements;
import java.util.*;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import javax.swing.JProgressBar;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
@@ -56,9 +59,7 @@ public class crawler {
        }
        if (!file_url.isEmpty()) {
            if (web_crawler.is_username) {
                String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user
                                                                                   // page
                // System.out.println(file_url);
                String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user page
                web_crawler.add_to_queue(user_url);
            } else {
                try (FileReader f_read = new FileReader(file_url);
@@ -72,7 +73,7 @@ public class crawler {
        }

        // Start crawling
        final int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        final int crawlLimit = 5; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        web_crawler.crawl(crawlLimit);

        // Print url queue
@@ -98,6 +99,7 @@ public class crawler {
    private boolean is_username = false; // flag for provided (reddit) username
    private static final int MAXNGRAM = 3;
    private Consumer<String> outputCallback;
    private JProgressBar progressBar;

    public crawler(String file) {
        url_queue = new LinkedList<>();
@@ -111,8 +113,9 @@ public class crawler {
        // Estimate page count based on compressed file size
    }

    public crawler(String file, Consumer<String> outputCallback) {
    public crawler(String file, Consumer<String> outputCallback, JProgressBar progressBar) {
        this.outputCallback = outputCallback;
        this.progressBar = progressBar;
        url_queue = new LinkedList<>();
        visited_urls = new HashSet<>();

@@ -139,7 +142,7 @@ public class crawler {
            }
            try {
                System.out.println("Processing: " + cur_site);
                outputCallback.accept("\nProcessing: " + cur_site);
                outputCallback.accept("\n\nProcessing: " + cur_site);
                Document web_data = get_web_data(cur_site);
                if (web_data != null) {
                    processPage(web_data, isPost);
@@ -156,7 +159,9 @@ public class crawler {

    public void build(String language) {
        String corpus = "";
        if (language.equals("English")) {
        if (language.equals("SmallEnglish")) {
            corpus = "brownSmall.txt";
        } else if (language.equals("English")) {
            corpus = "brown.txt";
        } else if (language.equals("German")) {
            corpus = "germanSmall.txt";
@@ -203,7 +208,7 @@ public class crawler {
                while ((line = reader.readLine()) != null) {
                    doc.append(line);
                }
                System.out.println("File read successfully.");
                // System.out.println("File read successfully.");
                outputCallback.accept("\nFile read successfully.");
            }
            return doc;
@@ -226,9 +231,14 @@ public class crawler {
            is_username = false;
            Elements posts = web_data.select("shreddit-profile-comment[href]"); // Get posts from user profile
            // Add all posts in overview to url_queue
            for (Element link : posts) {
            for (Element link : posts) { // for links in the text
                String link_url = link.attr("href");
                if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                // Strip the URL of any anchor tags or query tags
                if (link_url.contains("#"))
                    link_url = link_url.split("#")[0];
                if (link_url.contains("?"))
                    link_url = link_url.split("\\?")[0]; // ? is a special character in regex
                if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                    foundLinks++;
                    url_queue.add(link_url);
                }
@@ -237,8 +247,13 @@ public class crawler {
            Elements links = web_data.select("a[href]");
            for (Element link : links) {
                String link_url = link.attr("abs:href");
                // Strip the URL of any anchor tags
                if (link_url.contains("#"))
                    link_url = link_url.split("#")[0];
                if (link_url.contains("?"))
                    link_url = link_url.split("\\?")[0]; // ? is a special character in regex
                // System.out.println("Found link: " + link_url);
                if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                    // System.out.println("Adding link to queue: " + link_url);
                    foundLinks++;
                    url_queue.add(link_url);
@@ -259,11 +274,15 @@ public class crawler {
            // Break the page text into manageable chunks, considering sentences
            List<String> chunks = splitTextIntoChunks(web_data.text());
            if (chunks.isEmpty()) {
                System.out.println("No text found on page.");
                // System.out.println("No text found on page.");
                outputCallback.accept("\nNo text found on page.");
                return;
            }
            // System.out.println(chunks.size());
            if (progressBar != null) {
                progressBar.setMaximum(1024);
                progressBar.setStringPainted(true);
            }

            for (String chunk : chunks) {
                // chunkCount++;
@@ -274,9 +293,12 @@ public class crawler {
                extractWordUsage(chunk, wordUsage);
                uncompressedData = wordUsage.serialize();
                compressedData = compress(uncompressedData);
                if (progressBar != null) {
                    progressBar.setValue(compressedData.length - compression_size);
                }
                if ((compressedData.length - compression_size > 1024)) {
                    System.out.println("Previous compressed data size: " + compression_size
                            + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "
                    System.out.println("Previous size: " + compression_size
                            + " bytes. Current size: " + compressedData.length + " bytes. Delta: "
                            + (compressedData.length - compression_size) + " bytes.");
                    outputCallback.accept("\nPrevious compressed data size: " + compression_size
                            + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "
@@ -286,14 +308,33 @@ public class crawler {
                    sizeLimitExceeded = true;
                    uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
                    compressedData = compress(uncompressedData); // Recompress the reverted state
                    if (progressBar != null) {
                        progressBar.setValue(1024);
                    }
                    // Output # of links found in page
                    System.out.println("# of additional links found: " + foundLinks + "\n");
                    outputCallback.accept("\n# of additional links found: " + foundLinks + "\n");            
                    break; // Stop processing further chunks
                }
            }
        } else {
            if (progressBar != null) {
                progressBar.setMaximum(3);
                progressBar.setStringPainted(true);
                progressBar.setValue(0);
            }
            extractWordUsage(web_data.text().replaceAll("\\p{Punct}", ""), wordUsage);
            // System.out.println("Ngrams built successfully. for size:"+MAXNGRAM);
            if (progressBar != null)
                progressBar.setValue(1);
            uncompressedData = wordUsage.serialize();
            if (progressBar != null)
                progressBar.setValue(2);
            compressedData = compress(uncompressedData);
            if (progressBar != null) {
                progressBar.setValue(3);
                progressBar.setString("Done building inital trie");
            }
            build_off_corpus = false; // if there are urls to read it should still be able to read them
        }

@@ -318,17 +359,12 @@ public class crawler {

        // Output sizes of both compressed and uncompressed data for reference
        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes\n");
        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
        outputCallback.accept("\nUncompressed metadata size: " + uncompressedData.length + " bytes");
        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes, Uncompressed metadata size: " + uncompressedData.length + " bytes");
        // Output rate of processing
        double processingRate = web_data.text().length() / ((endTime - startTime) / 1000000000.0);
        System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
        outputCallback.accept("\nRate of processing: " + Math.round(processingRate) + " bytes/second");
        // Output # of links found in page
        System.out.println("# of additional links found: " + foundLinks + "\n");
        outputCallback.accept("\n# of additional links found: " + foundLinks + "\n");

        compression_size = compressedData.length;
    }

+50011 −0

File added.

Preview size limit exceeded, changes collapsed.