Commit 566b6762 authored by Moises Bensadon

implemented realtime processing for gui #11 via Consumers

parent 2d528d61
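
For context on the approach: instead of redirecting System.out, the GUI now hands the crawler a Consumer<String>, and the crawler pushes each log line back through it as it is produced, with the Swing update marshalled onto the Event Dispatch Thread. A minimal self-contained sketch of that wiring (Worker and CallbackDemo are illustrative names, not classes from this repo):

import java.util.function.Consumer;
import javax.swing.JTextArea;
import javax.swing.SwingUtilities;

// Illustrative producer: reports progress through a callback instead of System.out.
class Worker {
    private final Consumer<String> out;

    Worker(Consumer<String> out) {
        this.out = out;
    }

    void run() {
        for (int i = 1; i <= 3; i++) {
            out.accept("\nProcessing step " + i); // emitted as work happens, not at the end
        }
    }
}

class CallbackDemo {
    public static void main(String[] args) {
        JTextArea resultArea = new JTextArea();
        StringBuilder buffer = new StringBuilder();
        // The GUI supplies the Consumer; the Swing component is only touched
        // on the Event Dispatch Thread, via invokeLater.
        Worker worker = new Worker(line -> {
            buffer.append(line);
            SwingUtilities.invokeLater(() -> resultArea.setText(buffer.toString()));
        });
        new Thread(worker::run).start(); // background thread keeps the EDT free
    }
}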
MainApp.java +35 −20
@@ -11,7 +11,7 @@ import java.util.List;

 public class MainApp extends JFrame {
     private final JTextField urlField;
-    private final JTextArea resultArea;
+    private JTextArea resultArea;
     private final JButton runButton;
     private final JComboBox<String> moduleSelector;

@@ -22,8 +22,20 @@ public class MainApp extends JFrame {
         setLocationRelativeTo(null);
         setLayout(new BorderLayout());
 
+        // Check if the metadata file exists
+        if (!Files.exists(Paths.get("metadata.ser"))) {
+            String[] languages = {"en", "es", "pt", "it"};
+            String selectedLanguage = (String) JOptionPane.showInputDialog(this, "Metadata file not found. Please choose a language to build from.", "Language Selection", JOptionPane.PLAIN_MESSAGE, null, languages, languages[0]);
+            if (selectedLanguage == null) {
+                System.exit(0);
+            }
+            crawler webCrawler = new crawler();
+            webCrawler.build(selectedLanguage);
+        }
+
         // Module selector
-        String[] modules = {"Select Module", "Web Crawler", "Reddit Crawler", "Checker", "Corrector"};
+        String[] modules = {"Select Module", "Web Crawler", "Reddit Crawler", "File Checker", "File Corrector", "Text Checker", "Text Corrector"};
         moduleSelector = new JComboBox<>(modules);
 
         // User input entry field and run button
@@ -40,11 +52,17 @@ public class MainApp extends JFrame {
                 case "Reddit Crawler":
                     runCrawler("https://www.reddit.com/r/" + input);
                     break;
-                case "Checker":
-                    runChecker(input);
+                case "File Checker":
+                    runChecker(input, true);
                     break;
+                case "File Corrector":
+                    runCorrector(input, true);
+                    break;
+                case "Text Checker":
+                    runChecker(input, false);
+                    break;
-                case "Corrector":
-                    runCorrector(input);
+                case "Text Corrector":
+                    runCorrector(input, false);
                     break;
                 default:
                     JOptionPane.showMessageDialog(this, "Select a valid module");
@@ -65,7 +83,11 @@ public class MainApp extends JFrame {
     }
 
     private void runCrawler(String input) {
-        crawler webCrawler = new crawler();
+        StringBuilder outputBuilder = new StringBuilder();
+        crawler webCrawler = new crawler(output -> {
+            outputBuilder.append(output);
+            SwingUtilities.invokeLater(() -> resultArea.setText(outputBuilder.toString()));
+        });
         if (input.startsWith("http")) {
             // Input is a single URL
             webCrawler.add_to_queue(input);
@@ -79,19 +101,12 @@ public class MainApp extends JFrame {
                 return;
             }
         }
 
-        // Redirect output to the GUI
-        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
-        PrintStream printStream = new PrintStream(outputStream);
-        PrintStream oldStream = System.out;
-        System.setOut(printStream);
-
-        // Run the crawler
+        // Create a new thread for running the crawler
+        Thread crawlerThread = new Thread(() -> {
             webCrawler.crawl(5); // page limit; adjust as needed
 
-        // Restore standard output and update the GUI
-        System.setOut(oldStream);
-        resultArea.setText(outputStream.toString());
+        });
+        // Start the crawler thread
+        crawlerThread.start();
     }
 
     private void runChecker(String filePath) {
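
A note on the threading change above: the removed System.setOut redirection only updated the text area once crawl() had returned, and it swapped a process-wide stream from the event thread. Running the crawl on a worker thread and delivering lines through the callback fixes both. Swing's SwingWorker would be an equally valid shape here, since its publish/process pair already batches results onto the EDT; a hedged sketch of that alternative, not what the commit uses:

import java.util.List;
import javax.swing.JTextArea;
import javax.swing.SwingWorker;

// Sketch: stream log lines into a JTextArea with SwingWorker.
class CrawlTask extends SwingWorker<Void, String> {
    private final JTextArea resultArea;

    CrawlTask(JTextArea resultArea) {
        this.resultArea = resultArea;
    }

    @Override
    protected Void doInBackground() {
        // Runs off the EDT; publish() forwards lines to process().
        publish("\nProcessing: https://example.com"); // illustrative output
        publish("\nTotal pages visited: 1");
        return null;
    }

    @Override
    protected void process(List<String> lines) {
        // Runs on the EDT with the lines published since the last call.
        lines.forEach(resultArea::append);
    }
}

Started with new CrawlTask(resultArea).execute(). Appending to the text area also avoids rebuilding the whole buffer on every line, which the StringBuilder approach above pays for on long crawls.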
crawler.java +45 −7
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.ArrayList;
+import java.util.function.Consumer;
 
 import java.lang.Math;

@@ -39,9 +40,11 @@ public class crawler {
                 web_crawler.debug = true;
             } else if (args[i].equals("--build")) {
                 web_crawler.build_off_corpus = true;
+                System.out.println("Building off-corpus...");
+                web_crawler.build(args[i + 1]);
             }
         }
-        if (!web_crawler.build_off_corpus) {
+        if (!file_url.isEmpty()) {
             try (FileReader f_read = new FileReader(file_url);
                 BufferedReader buf_read = new BufferedReader(f_read)) {
                 String url_line;
@@ -68,6 +71,8 @@ public class crawler {
     private boolean debug = false; // flag that outputs uncompressed json showing the trie
     private boolean build_off_corpus = false;
     private static final int MAXNGRAM = 3;
+    private Consumer<String> outputCallback = s -> {}; // no-op default so CLI runs without a GUI callback don't NPE
 
     public crawler() {
         url_queue = new LinkedList<>();
@@ -79,16 +84,21 @@ public class crawler {
         // Estimate page count based on compressed file size
     }
 
+    public crawler(Consumer<String> outputCallback) {
+        this.outputCallback = outputCallback;
+        url_queue = new LinkedList<>();
+        visited_urls = new HashSet<>();
+
+        // Load trie from file
+        wordUsage = loadFile(filePath);
+    }
+
     public void add_to_queue(String url) {
         url_queue.add(url);
     }
 
     public void crawl(int maxPages) {
         int pageCount = 0;
-        if (build_off_corpus) {
-            System.out.println("Building off corpus...");
-            processPage(get_file_text("brown.txt")); // TODO: extend to multiple files i.e. one per language
-        }
         while (!url_queue.isEmpty() && pageCount < maxPages) {
             String cur_site = url_queue.poll();
             if (cur_site == null || visited_urls.contains(cur_site)) {
@@ -96,6 +106,7 @@ public class crawler {
             }
             try {
                 System.out.println("Processing: " + cur_site);
+                outputCallback.accept("\nProcessing: " + cur_site);
                 Document web_data = get_web_data(cur_site);
                 if (web_data != null) {
                     processPage(web_data);
@@ -107,6 +118,24 @@ public class crawler {
             }
         }
         System.out.println("Total pages visited: " + pageCount);
+        outputCallback.accept("\nTotal pages visited: " + pageCount);
     }
 
+    public void build(String language) {
+        String corpus = "";
+        if (language.equals("en")) {
+            corpus = "brown.txt";
+        } else if (language.equals("es")) {
+            corpus = "es.txt";
+        } else if (language.equals("it")) {
+            corpus = "it.txt";
+        } else if (language.equals("pt")) {
+            corpus = "pt.txt";
+        } else {
+            System.err.println("Unsupported language: " + language);
+            return;
+        }
+        processPage(get_file_text(corpus));
+    }
 
     private Document get_web_data(String url) throws IOException {
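
The new build() method maps a language code to a corpus file with an if/else chain. The same dispatch could be table-driven, which keeps each added language a one-line change; a sketch of that alternative (same file names as the commit, Java 9+ Map.of):

import java.util.Map;

class CorpusLookup {
    private static final Map<String, String> CORPORA = Map.of(
            "en", "brown.txt",
            "es", "es.txt",
            "it", "it.txt",
            "pt", "pt.txt");

    // Returns the corpus file for a language code, or null if unsupported.
    static String corpusFor(String language) {
        String corpus = CORPORA.get(language);
        if (corpus == null) {
            System.err.println("Unsupported language: " + language);
        }
        return corpus;
    }
}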
@@ -136,6 +165,7 @@ public class crawler {
                     doc.append(line);
                 }
                 System.out.println("File read successfully.");
+                outputCallback.accept("\nFile read successfully.");
             }
             return doc;
         } catch (IOException e) {
@@ -169,6 +199,7 @@ public class crawler {
             List<String> chunks = splitTextIntoChunks(web_data.text());
             if (chunks.isEmpty()) {
                 System.out.println("No text found on page.");
+                outputCallback.accept("\nNo text found on page.");
                 return;
             }
             for (String chunk : chunks) {
@@ -180,7 +211,9 @@ public class crawler {
                 compressedData = compress(uncompressedData);
                 if (compressedData.length - compression_size > 1024) {
                     System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: " + (compressedData.length - compression_size) + " bytes.");
+                    outputCallback.accept("\nPrevious compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: " + (compressedData.length - compression_size) + " bytes.");
                     System.out.println("Size limit exceeded. Reverting to previous chunk.");
+                    outputCallback.accept("\nSize limit exceeded. Reverting to previous chunk.");
                     sizeLimitExceeded = true;
                     uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
                     compressedData = compress(uncompressedData); // Recompress the reverted state
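
The hunk above is the heart of the size cap: after each chunk is folded in, the trie is re-serialized and compressed, and if the compressed size grew by more than 1024 bytes the previous snapshot is restored and recompressed. A standalone sketch of that accept/revert loop, using java.util.zip.Deflater as a stand-in for the repo's compress() (whose actual codec this diff doesn't show):

import java.io.ByteArrayOutputStream;
import java.util.zip.Deflater;

class SizeGuard {
    // Stand-in for the repo's compress(): plain DEFLATE over a byte array.
    static byte[] compress(byte[] data) {
        Deflater deflater = new Deflater();
        deflater.setInput(data);
        deflater.finish();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buf = new byte[4096];
        while (!deflater.finished()) {
            out.write(buf, 0, deflater.deflate(buf));
        }
        deflater.end();
        return out.toByteArray();
    }

    public static void main(String[] args) {
        byte[] accepted = "accepted state".getBytes();
        int lastSize = compress(accepted).length;

        byte[] candidate = "accepted state plus one more chunk".getBytes();
        byte[] compressed = compress(candidate);
        if (compressed.length - lastSize > 1024) {
            // Delta over budget: revert to the last accepted snapshot and recompress.
            compressed = compress(accepted);
        } else {
            accepted = candidate; // chunk accepted; new baseline
            lastSize = compressed.length;
        }
        System.out.println("Compressed size: " + compressed.length + " bytes");
    }
}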
@@ -205,6 +238,7 @@ public class crawler {
         writeToFile(compressedData, filePath);
         if (!sizeLimitExceeded) {
             System.out.println("Compressed tree exported successfully to: " + filePath);
+            outputCallback.accept("\nCompressed tree exported successfully to: " + filePath);
         } else {
             System.out.println("Compressed data truncated due to size limit.");
         }
@@ -214,12 +248,16 @@ public class crawler {

         // Output sizes of both compressed and uncompressed data for reference
         System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
+        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes");
         System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
+        outputCallback.accept("\nUncompressed metadata size: " + uncompressedData.length + " bytes");
         // Output rate of processing
         double processingRate = web_data.text().length() / ((endTime - startTime) / 1000000000.0);
         System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
+        outputCallback.accept("\nRate of processing: " + Math.round(processingRate) + " bytes/second");
         // Output # of links found in page
         System.out.println("# of additional links found: " + foundLinks + "\n");
+        outputCallback.accept("\n# of additional links found: " + foundLinks + "\n");
 
         compression_size = compressedData.length;
     }
@@ -300,7 +338,7 @@ public class crawler {
             System.out.println("Metadata loaded successfully.");
             return trie;
         } catch (IOException e) {
-            System.err.println("Error reading metadata from file: " + e.getMessage());
+            // System.err.println("Error reading metadata from file: " + e.getMessage());
             System.err.println("Creating new trie...");
             return new TrieNode();
         }
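
With both constructors in place there are two entry paths into the crawler; a small usage sketch (URL illustrative, and assuming the callback field defaults to a no-op as in the field declaration above, so the CLI path cannot NPE):

class UsageSketch {
    public static void main(String[] args) {
        // GUI path: realtime output flows through the callback.
        crawler guiCrawler = new crawler(line -> System.out.print("[gui]" + line));
        guiCrawler.add_to_queue("https://example.com"); // illustrative URL
        guiCrawler.crawl(1);

        // CLI path: no callback supplied; output appears on System.out only.
        crawler cliCrawler = new crawler();
        cliCrawler.add_to_queue("https://example.com");
        cliCrawler.crawl(1);
    }
}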