Commit 566b6762 authored by Moises Bensadon

implemented realtime processing for gui #11 via Consumers

parent 2d528d61
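
For context on the approach: instead of redirecting System.out, the GUI now hands the crawler a Consumer<String>, and the crawler pushes each log line back through it as it is produced, with the Swing update marshalled onto the Event Dispatch Thread. A minimal self-contained sketch of that wiring (Worker and CallbackDemo are illustrative names, not classes from this repo):

import java.util.function.Consumer;
import javax.swing.JTextArea;
import javax.swing.SwingUtilities;

// Illustrative producer: reports progress through a callback instead of System.out.
class Worker {
    private final Consumer<String> out;

    Worker(Consumer<String> out) {
        this.out = out;
    }

    void run() {
        for (int i = 1; i <= 3; i++) {
            out.accept("\nProcessing step " + i); // emitted as work happens, not at the end
        }
    }
}

class CallbackDemo {
    public static void main(String[] args) {
        JTextArea resultArea = new JTextArea();
        StringBuilder buffer = new StringBuilder();
        // The GUI supplies the Consumer; the Swing component is only touched
        // on the Event Dispatch Thread, via invokeLater.
        Worker worker = new Worker(line -> {
            buffer.append(line);
            SwingUtilities.invokeLater(() -> resultArea.setText(buffer.toString()));
        });
        new Thread(worker::run).start(); // background thread keeps the EDT free
    }
}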
MainApp.java +35 −20
@@ -11,7 +11,7 @@ import java.util.List;

 public class MainApp extends JFrame {
     private final JTextField urlField;
-    private final JTextArea resultArea;
+    private JTextArea resultArea;
     private final JButton runButton;
     private final JComboBox<String> moduleSelector;

@@ -22,8 +22,20 @@ public class MainApp extends JFrame {
         setLocationRelativeTo(null);
         setLayout(new BorderLayout());
 
+        // Check if the metadata file exists
+        if (!Files.exists(Paths.get("metadata.ser"))) {
+            String[] languages = {"en", "es", "pt", "it"};
+            String selectedLanguage = (String) JOptionPane.showInputDialog(this, "Metadata file not found. Please choose a language to build from.", "Language Selection", JOptionPane.PLAIN_MESSAGE, null, languages, languages[0]);
+            if (selectedLanguage == null) {
+                System.exit(0);
+            }
+            crawler webCrawler = new crawler();
+            webCrawler.build(selectedLanguage);
+        }
+
         // Module selector
-        String[] modules = {"Select Module", "Web Crawler", "Reddit Crawler", "Checker", "Corrector"};
+        String[] modules = {"Select Module", "Web Crawler", "Reddit Crawler", "File Checker", "File Corrector", "Text Checker", "Text Corrector"};
         moduleSelector = new JComboBox<>(modules);
 
         // User input entry field and run button
@@ -40,11 +52,17 @@ public class MainApp extends JFrame {
                 case "Reddit Crawler":
                     runCrawler("https://www.reddit.com/r/" + input);
                     break;
-                case "Checker":
-                    runChecker(input);
+                case "File Checker":
+                    runChecker(input, true);
                     break;
+                case "File Corrector":
+                    runCorrector(input, true);
+                    break;
+                case "Text Checker":
+                    runChecker(input, false);
+                    break;
-                case "Corrector":
-                    runCorrector(input);
+                case "Text Corrector":
+                    runCorrector(input, false);
                     break;
                 default:
                     JOptionPane.showMessageDialog(this, "Select a valid module");
@@ -65,7 +83,11 @@ public class MainApp extends JFrame {
     }
 
     private void runCrawler(String input) {
-        crawler webCrawler = new crawler();
+        StringBuilder outputBuilder = new StringBuilder();
+        crawler webCrawler = new crawler(output -> {
+            outputBuilder.append(output);
+            SwingUtilities.invokeLater(() -> resultArea.setText(outputBuilder.toString()));
+        });
         if (input.startsWith("http")) {
             // Input is a single URL
             webCrawler.add_to_queue(input);
@@ -79,19 +101,12 @@ public class MainApp extends JFrame {
                 return;
             }
         }
 
-        // Redirect output to the GUI
-        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
-        PrintStream printStream = new PrintStream(outputStream);
-        PrintStream oldStream = System.out;
-        System.setOut(printStream);
-
-        // Run the crawler
+        // Create a new thread for running the crawler
+        Thread crawlerThread = new Thread(() -> {
             webCrawler.crawl(5); // page limit; adjust as needed
 
-        // Restore standard output and update the GUI
-        System.setOut(oldStream);
-        resultArea.setText(outputStream.toString());
+        });
+        // Start the crawler thread
+        crawlerThread.start();
     }
 
     private void runChecker(String filePath) {
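
A note on the threading change above: the removed System.setOut redirection only updated the text area once crawl() had returned, and it swapped a process-wide stream from the event thread. Running the crawl on a worker thread and delivering lines through the callback fixes both. Swing's SwingWorker would be an equally valid shape here, since its publish/process pair already batches results onto the EDT; a hedged sketch of that alternative, not what the commit uses:

import java.util.List;
import javax.swing.JTextArea;
import javax.swing.SwingWorker;

// Sketch: stream log lines into a JTextArea with SwingWorker.
class CrawlTask extends SwingWorker<Void, String> {
    private final JTextArea resultArea;

    CrawlTask(JTextArea resultArea) {
        this.resultArea = resultArea;
    }

    @Override
    protected Void doInBackground() {
        // Runs off the EDT; publish() forwards lines to process().
        publish("\nProcessing: https://example.com"); // illustrative output
        publish("\nTotal pages visited: 1");
        return null;
    }

    @Override
    protected void process(List<String> lines) {
        // Runs on the EDT with the lines published since the last call.
        lines.forEach(resultArea::append);
    }
}

Started with new CrawlTask(resultArea).execute(). Appending to the text area also avoids rebuilding the whole buffer on every line, which the StringBuilder approach above pays for on long crawls.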
crawler.java +45 −7
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.ArrayList;
+import java.util.function.Consumer;
 
 import java.lang.Math;

@@ -39,9 +40,11 @@ public class crawler {
                 web_crawler.debug = true;
             } else if (args[i].equals("--build")) {
                 web_crawler.build_off_corpus = true;
+                System.out.println("Building off-corpus...");
+                web_crawler.build(args[i + 1]);
             }
         }
-        if (!web_crawler.build_off_corpus) {
+        if (!file_url.isEmpty()) {
             try (FileReader f_read = new FileReader(file_url);
                 BufferedReader buf_read = new BufferedReader(f_read)) {
                 String url_line;
@@ -68,6 +71,8 @@ public class crawler {
     private boolean debug = false; // flag that outputs uncompressed json showing the trie
     private boolean build_off_corpus = false;
     private static final int MAXNGRAM = 3;
+    private Consumer<String> outputCallback = s -> {}; // no-op default so CLI runs without a GUI callback don't NPE
 
     public crawler() {
         url_queue = new LinkedList<>();
@@ -79,16 +84,21 @@ public class crawler {
         // Estimate page count based on compressed file size
     }
 
+    public crawler(Consumer<String> outputCallback) {
+        this.outputCallback = outputCallback;
+        url_queue = new LinkedList<>();
+        visited_urls = new HashSet<>();
+
+        // Load trie from file
+        wordUsage = loadFile(filePath);
+    }
+
     public void add_to_queue(String url) {
         url_queue.add(url);
     }
 
     public void crawl(int maxPages) {
         int pageCount = 0;
-        if (build_off_corpus) {
-            System.out.println("Building off corpus...");
-            processPage(get_file_text("brown.txt")); // TODO: extend to multiple files i.e. one per language
-        }
         while (!url_queue.isEmpty() && pageCount < maxPages) {
             String cur_site = url_queue.poll();
             if (cur_site == null || visited_urls.contains(cur_site)) {
@@ -96,6 +106,7 @@ public class crawler {
             }
             try {
                 System.out.println("Processing: " + cur_site);
+                outputCallback.accept("\nProcessing: " + cur_site);
                 Document web_data = get_web_data(cur_site);
                 if (web_data != null) {
                     processPage(web_data);
@@ -107,6 +118,24 @@ public class crawler {
             }
         }
         System.out.println("Total pages visited: " + pageCount);
+        outputCallback.accept("\nTotal pages visited: " + pageCount);
     }
 
+    public void build(String language) {
+        String corpus = "";
+        if (language.equals("en")) {
+            corpus = "brown.txt";
+        } else if (language.equals("es")) {
+            corpus = "es.txt";
+        } else if (language.equals("it")) {
+            corpus = "it.txt";
+        } else if (language.equals("pt")) {
+            corpus = "pt.txt";
+        } else {
+            System.err.println("Unsupported language: " + language);
+            return;
+        }
+        processPage(get_file_text(corpus));
+    }
 
     private Document get_web_data(String url) throws IOException {
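
The new build() method maps a language code to a corpus file with an if/else chain. The same dispatch could be table-driven, which keeps each added language a one-line change; a sketch of that alternative (same file names as the commit, Java 9+ Map.of):

import java.util.Map;

class CorpusLookup {
    private static final Map<String, String> CORPORA = Map.of(
            "en", "brown.txt",
            "es", "es.txt",
            "it", "it.txt",
            "pt", "pt.txt");

    // Returns the corpus file for a language code, or null if unsupported.
    static String corpusFor(String language) {
        String corpus = CORPORA.get(language);
        if (corpus == null) {
            System.err.println("Unsupported language: " + language);
        }
        return corpus;
    }
}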
@@ -136,6 +165,7 @@ public class crawler {
                     doc.append(line);
                 }
                 System.out.println("File read successfully.");
+                outputCallback.accept("\nFile read successfully.");
             }
             return doc;
         } catch (IOException e) {
@@ -169,6 +199,7 @@ public class crawler {
             List<String> chunks = splitTextIntoChunks(web_data.text());
             if (chunks.isEmpty()) {
                 System.out.println("No text found on page.");
+                outputCallback.accept("\nNo text found on page.");
                 return;
             }
             for (String chunk : chunks) {
@@ -180,7 +211,9 @@ public class crawler {
                 compressedData = compress(uncompressedData);
                 if (compressedData.length - compression_size > 1024) {
                     System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: " + (compressedData.length - compression_size) + " bytes.");
+                    outputCallback.accept("\nPrevious compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: " + (compressedData.length - compression_size) + " bytes.");
                     System.out.println("Size limit exceeded. Reverting to previous chunk.");
+                    outputCallback.accept("\nSize limit exceeded. Reverting to previous chunk.");
                     sizeLimitExceeded = true;
                     uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
                     compressedData = compress(uncompressedData); // Recompress the reverted state
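
The hunk above is the heart of the size cap: after each chunk is folded in, the trie is re-serialized and compressed, and if the compressed size grew by more than 1024 bytes the previous snapshot is restored and recompressed. A standalone sketch of that accept/revert loop, using java.util.zip.Deflater as a stand-in for the repo's compress() (whose actual codec this diff doesn't show):

import java.io.ByteArrayOutputStream;
import java.util.zip.Deflater;

class SizeGuard {
    // Stand-in for the repo's compress(): plain DEFLATE over a byte array.
    static byte[] compress(byte[] data) {
        Deflater deflater = new Deflater();
        deflater.setInput(data);
        deflater.finish();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buf = new byte[4096];
        while (!deflater.finished()) {
            out.write(buf, 0, deflater.deflate(buf));
        }
        deflater.end();
        return out.toByteArray();
    }

    public static void main(String[] args) {
        byte[] accepted = "accepted state".getBytes();
        int lastSize = compress(accepted).length;

        byte[] candidate = "accepted state plus one more chunk".getBytes();
        byte[] compressed = compress(candidate);
        if (compressed.length - lastSize > 1024) {
            // Delta over budget: revert to the last accepted snapshot and recompress.
            compressed = compress(accepted);
        } else {
            accepted = candidate; // chunk accepted; new baseline
            lastSize = compressed.length;
        }
        System.out.println("Compressed size: " + compressed.length + " bytes");
    }
}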
@@ -205,6 +238,7 @@ public class crawler {
         writeToFile(compressedData, filePath);
         if (!sizeLimitExceeded) {
             System.out.println("Compressed tree exported successfully to: " + filePath);
+            outputCallback.accept("\nCompressed tree exported successfully to: " + filePath);
         } else {
             System.out.println("Compressed data truncated due to size limit.");
         }
@@ -214,12 +248,16 @@ public class crawler {

         // Output sizes of both compressed and uncompressed data for reference
         System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
+        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes");
         System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
+        outputCallback.accept("\nUncompressed metadata size: " + uncompressedData.length + " bytes");
         // Output rate of processing
         double processingRate = web_data.text().length() / ((endTime - startTime) / 1000000000.0);
         System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
+        outputCallback.accept("\nRate of processing: " + Math.round(processingRate) + " bytes/second");
         // Output # of links found in page
         System.out.println("# of additional links found: " + foundLinks + "\n");
+        outputCallback.accept("\n# of additional links found: " + foundLinks + "\n");
 
         compression_size = compressedData.length;
     }
@@ -300,7 +338,7 @@ public class crawler {
             System.out.println("Metadata loaded successfully.");
             return trie;
         } catch (IOException e) {
-            System.err.println("Error reading metadata from file: " + e.getMessage());
+            // System.err.println("Error reading metadata from file: " + e.getMessage());
             System.err.println("Creating new trie...");
             return new TrieNode();
         }
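
With both constructors in place there are two entry paths into the crawler; a small usage sketch (URL illustrative, and assuming the callback field defaults to a no-op as in the field declaration above, so the CLI path cannot NPE):

class UsageSketch {
    public static void main(String[] args) {
        // GUI path: realtime output flows through the callback.
        crawler guiCrawler = new crawler(line -> System.out.print("[gui]" + line));
        guiCrawler.add_to_queue("https://example.com"); // illustrative URL
        guiCrawler.crawl(1);

        // CLI path: no callback supplied; output appears on System.out only.
        crawler cliCrawler = new crawler();
        cliCrawler.add_to_queue("https://example.com");
        cliCrawler.crawl(1);
    }
}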