Merge branch '13-gui' into 'master' (3c7a69e1) · Commits · EC504 Spring 2024 Group Projects / Group7

checker_test_file.txt

+1 −1

Original line number	Diff line number	Diff line
		This is unambiguous. This is word odd so choice. Sorry for the inconvenience.
		Kenya, officially the Republic of Kenya
		No newline at end of file
		Kenya, officially the Republic of Kenya.
		No newline at end of file

pom.xml

+6 −0

Original line number	Diff line number	Diff line
		@@ -20,6 +20,12 @@
		<artifactId>jsoup</artifactId>
		<version>1.15.3</version>
		</dependency>

		<dependency>
		<groupId>de.sciss</groupId>
		<artifactId>dotterweide-ui_2.12</artifactId>
		<version>0.4.3</version>
		</dependency>
		</dependencies>

		</project>
		No newline at end of file

src/main/java/edu/bu/LanguageCorrection/Checker.java

+2 −2

Original line number	Diff line number	Diff line
		package edu.bu.LanguageCorrection;

		import java.io.IOException;
		import java.nio.file.Files;
		import java.nio.file.Paths;
		@@ -9,7 +10,6 @@ import java.util.zip.Inflater;
		import java.io.ByteArrayOutputStream;
		import java.io.FileInputStream;


		public class Checker {
		public void analyze(String text) {
		List<String> sentences = TextProcessor.extractSentences(text);

src/main/java/edu/bu/LanguageCorrection/MainApp.java

0 → 100644

+240 −0

Original line number	Diff line number	Diff line
		package edu.bu.LanguageCorrection;

		import javax.sql.rowset.spi.SyncFactory;
		import javax.swing.*;
		import javax.swing.text.Highlighter;
		import javax.swing.text.Highlighter.Highlight;

		import dotterweide.editor.painter.HighlightPainter;

		import java.awt.*;
		// import java.awt.event.ActionEvent;
		import java.io.ByteArrayOutputStream;
		import java.io.PrintStream;
		import java.nio.file.Files;
		import java.nio.file.Paths;
		import java.util.List;
		import java.util.Map;
		import javax.swing.*;
		import javax.swing.text.*;
		import java.awt.Color;
		import java.awt.BorderLayout;
		import java.awt.event.ActionEvent;

		/**
		 * Swing front end for the language-correction tools.
		 *
		 * <p>The window offers a module selector, a single input field and a Run button. Depending on
		 * the selected module the input is treated as a URL / URL-list file (crawlers), a file path
		 * (file checker / corrector) or literal text (text checker / corrector). Output is shown in a
		 * read-only text area.
		 */
		public class MainApp extends JFrame {
		    /** Page limit passed to {@code crawler.crawl} for every crawl started from the GUI. */
		    private static final int MAX_CRAWL_PAGES = 5;

		    private final JTextField urlField;
		    private final JTextArea resultArea;
		    private final JButton runButton;
		    private final JComboBox<String> moduleSelector;

		    /**
		     * Builds the window. If {@code metadata.ser} does not exist in the working directory the
		     * user is first asked for a language to build from; cancelling the dialog skips the build.
		     */
		    public MainApp() {
		        super("Language Correction Tool");
		        setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
		        setSize(800, 600);
		        setLocationRelativeTo(null);
		        setLayout(new BorderLayout());

		        // Check if metadata file exists
		        if (!Files.exists(Paths.get("metadata.ser"))) {
		            String[] languages = {"en", "es", "pt", "it"};
		            String selectedLanguage = (String) JOptionPane.showInputDialog(this, "Metadata file not found. Please choose a language to build off of. \n(If you want to build from scratch just click Cancel)", "Language Selection", JOptionPane.PLAIN_MESSAGE, null, languages, languages[0]);
		            if (selectedLanguage != null) {
		                crawler webCrawler = new crawler();
		                webCrawler.build(selectedLanguage);
		            }
		        }

		        // Module selector
		        String[] modules = {"Select Module", "Web Crawler", "Reddit Crawler", "File Checker", "File Corrector", "Text Checker", "Text Corrector"};
		        moduleSelector = new JComboBox<>(modules);

		        // User input entry field and run button
		        urlField = new JTextField();
		        runButton = new JButton("Run");
		        runButton.addActionListener(e -> runSelectedModule());

		        JPanel northPanel = new JPanel(new BorderLayout());
		        northPanel.add(moduleSelector, BorderLayout.WEST);
		        northPanel.add(urlField, BorderLayout.CENTER);
		        northPanel.add(runButton, BorderLayout.EAST);

		        // Result area
		        resultArea = new JTextArea();
		        resultArea.setEditable(false);
		        JScrollPane scrollPane = new JScrollPane(resultArea);

		        add(northPanel, BorderLayout.NORTH);
		        add(scrollPane, BorderLayout.CENTER);
		    }

		    /** Dispatches the trimmed input field content to the module chosen in the selector. */
		    private void runSelectedModule() {
		        String selectedModule = (String) moduleSelector.getSelectedItem();
		        String input = urlField.getText().trim();

		        // switch on a null String throws, so treat "nothing selected" like an invalid choice.
		        if (selectedModule == null) {
		            JOptionPane.showMessageDialog(this, "Select a valid module");
		            return;
		        }

		        switch (selectedModule) {
		            case "Web Crawler":
		                runCrawler(input);
		                break;
		            case "Reddit Crawler":
		                runCrawler("https://www.reddit.com/r/" + input);
		                break;
		            case "File Checker":
		                runChecker(input, true);
		                break;
		            case "File Corrector":
		                runCorrector(input, true);
		                break;
		            case "Text Checker":
		                runChecker(input, false);
		                break;
		            case "Text Corrector":
		                runCorrector(input, false);
		                break;
		            default:
		                JOptionPane.showMessageDialog(this, "Select a valid module");
		        }
		    }

		    /**
		     * Starts a crawl on a background thread and streams its progress into the result area.
		     *
		     * @param input a single URL (anything starting with {@code http}) or the path of a text
		     *              file listing one URL per line
		     */
		    private void runCrawler(String input) {
		        StringBuilder outputBuilder = new StringBuilder();
		        crawler webCrawler = new crawler(output -> {
		            outputBuilder.append(output);
		            // Snapshot on the calling thread: StringBuilder is not thread-safe, so the
		            // event-dispatch thread must never read it while the crawler keeps appending.
		            String snapshot = outputBuilder.toString();
		            SwingUtilities.invokeLater(() -> resultArea.setText(snapshot));
		        });
		        if (input.startsWith("http")) {
		            // Input is a single URL
		            webCrawler.add_to_queue(input);
		        } else {
		            // Input is file path
		            try {
		                List<String> lines = Files.readAllLines(Paths.get(input));
		                for (String line : lines) {
		                    String url = line.trim();
		                    // Blank lines (e.g. a trailing newline) are not URLs; do not queue them.
		                    if (!url.isEmpty()) {
		                        webCrawler.add_to_queue(url);
		                    }
		                }
		            } catch (Exception e) {
		                resultArea.setText("Error reading file: " + e.getMessage());
		                return;
		            }
		        }
		        // Crawl off the event-dispatch thread so the window stays responsive.
		        Thread crawlerThread = new Thread(() -> webCrawler.crawl(MAX_CRAWL_PAGES), "crawler-worker");
		        crawlerThread.start();
		    }

		    /**
		     * Analyzes the input sentence by sentence, lists the lowest-scoring phrase of each
		     * sentence in the result area and highlights that phrase in yellow.
		     *
		     * <p>Sentences for which the checker reports no scored phrase are omitted from the report.
		     * Any failure is shown as {@code "Error: ..."} in the result area.
		     *
		     * @param text   the text to check, or a file path when {@code isFile} is true
		     * @param isFile whether {@code text} names a file whose content should be checked
		     */
		    private void runChecker(String text, boolean isFile) {
		        try {
		            String content = isFile ? new String(Files.readAllBytes(Paths.get(text))) : text;
		            List<String> sentences = TextProcessor.extractSentences(content);

		            Checker checker = new Checker();
		            StringBuilder result = new StringBuilder();
		            // {start, end} offsets into the report, one per phrase that should be highlighted.
		            List<int[]> highlightRanges = new java.util.ArrayList<>();

		            for (String sentence : sentences) {
		                ScoredPhrase worst = findWorstPhrase(captureAnalysis(checker, sentence));
		                if (worst == null) {
		                    continue; // nothing scored for this sentence, nothing to report
		                }

		                int blockStart = result.length();
		                result.append("\nSentence: ").append(sentence)
		                        .append("\n>> Worst Phrase: ").append(worst.phrase)
		                        .append(" (Score: ").append(worst.score).append(")\n");

		                // Search only inside this sentence's own block so that a phrase repeated in
		                // several sentences is highlighted once per sentence, not always at its first
		                // occurrence in the whole report.
		                String needle = worst.phrase.replace("\"", "");
		                int at = needle.isEmpty() ? -1 : result.indexOf(needle, blockStart);
		                if (at != -1) {
		                    highlightRanges.add(new int[] {at, at + needle.length()});
		                } else {
		                    System.out.println("Text not found");
		                }
		            }

		            resultArea.setText(result.toString()); // Display the annotated results in the JTextArea

		            Highlighter highlighter = resultArea.getHighlighter();
		            highlighter.removeAllHighlights(); // drop highlights left over from a previous run
		            Highlighter.HighlightPainter painter = new DefaultHighlighter.DefaultHighlightPainter(Color.YELLOW);
		            for (int[] range : highlightRanges) {
		                try {
		                    highlighter.addHighlight(range[0], range[1], painter);
		                } catch (BadLocationException e) {
		                    e.printStackTrace();
		                }
		            }
		        } catch (Exception e) {
		            resultArea.setText("Error: " + e.getMessage());
		        }
		    }

		    /**
		     * Runs {@code checker.analyze(sentence)} with {@code System.out} temporarily redirected and
		     * returns everything the checker printed. {@code System.out} is restored even when the
		     * checker throws.
		     */
		    private static String captureAnalysis(Checker checker, String sentence) {
		        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
		        PrintStream capture = new PrintStream(buffer);
		        PrintStream originalOut = System.out;
		        System.setOut(capture);
		        try {
		            checker.analyze(sentence);
		        } finally {
		            capture.flush();
		            System.setOut(originalOut);
		        }
		        return buffer.toString();
		    }

		    /**
		     * Scans checker output for lines starting with {@code "phrases":} and returns the entry
		     * with the lowest score (first one wins on ties). A score printed as {@code null} counts
		     * as 0. Entries without a {@code :} separator or with a non-numeric score are skipped.
		     *
		     * @return the lowest-scoring phrase, or {@code null} if no phrase entry could be parsed
		     */
		    private static ScoredPhrase findWorstPhrase(String analysisOutput) {
		        String worstPhrase = null;
		        double lowestScore = Double.MAX_VALUE;

		        for (String line : analysisOutput.split("\n")) {
		            if (!line.startsWith("\"phrases\":")) {
		                continue;
		            }
		            String entries = line.replace("\"phrases\":", "").replace("{", "").replace("}", "").trim();
		            for (String entry : entries.split(",")) {
		                // Split on the last ':' so a phrase that itself contains ':' still parses;
		                // an empty map yields a single empty entry with no separator at all.
		                int separator = entry.lastIndexOf(':');
		                if (separator < 0) {
		                    continue;
		                }
		                String phrase = entry.substring(0, separator).trim();
		                String rawScore = entry.substring(separator + 1).trim();
		                double score;
		                try {
		                    score = rawScore.equals("null") ? 0 : Double.parseDouble(rawScore);
		                } catch (NumberFormatException ignored) {
		                    continue; // malformed entry: skip it rather than fail the whole check
		                }
		                if (score < lowestScore) {
		                    lowestScore = score;
		                    worstPhrase = phrase;
		                }
		            }
		        }
		        return worstPhrase == null ? null : new ScoredPhrase(worstPhrase, lowestScore);
		    }

		    /** A phrase exactly as printed by the checker (quotes included) together with its score. */
		    private static final class ScoredPhrase {
		        final String phrase;
		        final double score;

		        ScoredPhrase(String phrase, double score) {
		            this.phrase = phrase;
		            this.score = score;
		        }
		    }

		    /**
		     * Runs the corrector and shows the corrected text, or {@code "Error: ..."} on failure.
		     *
		     * @param input  the text to correct, or a file path when {@code isFile} is true
		     * @param isFile whether {@code input} names a file whose content should be corrected
		     */
		    private void runCorrector(String input, boolean isFile) {
		        try {
		            String content = isFile ? new String(Files.readAllBytes(Paths.get(input))) : input;
		            Corrector corrector = new Corrector();
		            String corrected = corrector.correct(content);
		            resultArea.setText("Corrected Text:\n" + corrected);
		        } catch (Exception e) {
		            resultArea.setText("Error: " + e.getMessage());
		        }
		    }

		    /** Creates and shows the main window on the Swing event-dispatch thread. */
		    public static void main(String[] args) {
		        SwingUtilities.invokeLater(() -> {
		            MainApp mainApp = new MainApp();
		            mainApp.setVisible(true);
		        });
		    }
		}

src/main/java/edu/bu/LanguageCorrection/crawler.java

+109 −53

Original line number	Diff line number	Diff line
		@@ -16,10 +16,13 @@ import java.io.FileInputStream;
		import java.io.FileOutputStream;
		import java.io.FileReader;
		import java.io.IOException;
		import java.util.Arrays;
		import java.util.List;
		import java.util.ArrayList;
		import java.util.function.Consumer;

		import java.lang.Math;


		public class crawler {

		public static void main(String[] args) throws IOException {
		@@ -34,15 +37,17 @@ public class crawler {
		} else if (args[i].equals("--debug")) {
		web_crawler.debug = true;
		} else if (args[i].equals("--build")) {
		web_crawler.build_off_corpus = true;
		System.out.println("Building off-corpus...");
		web_crawler.build(args[i + 1]);
		} else if (args[i].equals("--social")) {
		web_crawler.is_username = true;
		file_url = args[i + 1]; // Reuse variable to hold username
		}
		}
		if (!web_crawler.build_off_corpus) {
		if (!file_url.isEmpty()) {
		if (web_crawler.is_username) {
		String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user page
		String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user
		// page
		// System.out.println(file_url);
		web_crawler.add_to_queue(user_url);
		} else {
		@@ -66,6 +71,7 @@ public class crawler {
		// Print visited URLs
		web_crawler.get_visited();
		}

		// public void get_queue(){
		// for (String url : url_queue) {
		// System.out.println(url);
		@@ -81,6 +87,7 @@ public class crawler {
		private boolean build_off_corpus = false;
		private boolean is_username = false; // flag for provided (reddit) username
		private static final int MAXNGRAM = 3;
		private Consumer<String> outputCallback;

		public crawler() {
		url_queue = new LinkedList<>();
		@@ -88,20 +95,26 @@ public class crawler {

		// Load trie from file
		wordUsage = loadFile(filePath);
		this.outputCallback = System.out::println;

		// Estimate page count based on compressed file size
		}

		/**
		 * Creates a crawler that passes its progress messages to the given callback in addition
		 * to whatever it prints itself.
		 *
		 * @param outputCallback receiver for progress messages; when {@code null}, messages are
		 *                       discarded instead of failing later at the first progress report
		 */
		public crawler(Consumer<String> outputCallback) {
		    // Guard here: a stored null would only fail later, at the first outputCallback.accept call.
		    this.outputCallback = (outputCallback != null) ? outputCallback : message -> { };
		    url_queue = new LinkedList<>();
		    visited_urls = new HashSet<>();

		    // Load trie from file
		    wordUsage = loadFile(filePath);
		}

		/**
		 * Appends a URL to the end of the crawl queue.
		 *
		 * @param url the address to enqueue
		 */
		public void add_to_queue(String url) {
		    this.url_queue.add(url);
		}

		public void crawl(int maxPages) {
		int pageCount = 0;
		if (build_off_corpus) {
		System.out.println("Building off corpus...");
		processPage(get_file_text("brown.txt"), false); // TODO: extend to multiple files i.e. one per language
		}
		while (!url_queue.isEmpty() && pageCount < maxPages) {
		String cur_site = url_queue.poll();

		@@ -114,6 +127,7 @@ public class crawler {
		}
		try {
		System.out.println("Processing: " + cur_site);
		outputCallback.accept("\nProcessing: " + cur_site);
		Document web_data = get_web_data(cur_site);
		if (web_data != null) {
		processPage(web_data, isPost);
		@@ -125,12 +139,35 @@ public class crawler {
		}
		}
		System.out.println("Total pages visited: " + pageCount);
		outputCallback.accept("\nTotal pages visited: " + pageCount);
		}

		/**
		 * Processes the corpus file that belongs to the given language code.
		 *
		 * <p>Known codes are {@code en} (brown.txt), {@code es} (es.txt), {@code it} (it.txt) and
		 * {@code pt} (pt.txt). Any other code is reported on standard error and nothing is processed.
		 *
		 * @param language language code selecting the corpus file
		 */
		public void build(String language) {
		    final String corpusFile;
		    switch (language) {
		        case "en":
		            corpusFile = "brown.txt";
		            break;
		        case "es":
		            corpusFile = "es.txt";
		            break;
		        case "it":
		            corpusFile = "it.txt";
		            break;
		        case "pt":
		            corpusFile = "pt.txt";
		            break;
		        default:
		            System.err.println("Unsupported language: " + language);
		            return;
		    }

		    // Only flag the corpus build once the language is known to be supported.
		    build_off_corpus = true;

		    processPage(get_file_text(corpusFile), false);
		}

		private Document get_web_data(String url) throws IOException {
		//use execute() in order to receive a response object -> allows status code checking
		// use execute() in order to receive a response object -> allows status code
		// checking
		Connection.Response req_response = Jsoup.connect(url)
		.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
		.userAgent(
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
		.referrer("http://www.google.com")
		.execute();

		@@ -143,7 +180,8 @@ public class crawler {
		}

		private Document get_file_text(String filename) {
		// takes filename as input, reads it from src/main/resources and returns the text as a Document object
		// takes filename as input, reads it from src/main/resources and returns the
		// text as a Document object
		try {
		Document doc = new Document("");
		doc.title(filename);
		@@ -154,6 +192,7 @@ public class crawler {
		doc.append(line);
		}
		System.out.println("File read successfully.");
		outputCallback.accept("\nFile read successfully.");
		}
		return doc;
		} catch (IOException e) {
		@@ -198,7 +237,8 @@ public class crawler {
		// Ignore for now - optimizes body + comment content retrieval
		// if(isPost) {
		// Get proper tags + queries
		// Element block = web_data.selectFirst("shreddit-post.block.xs:mt-xs.xs:-mx-xs.xs:px-xs.xs:rounded-[16px].pt-xs.nd:pt-xs.bg-[color:var(--shreddit-content-background)].box-border.mb-xs.nd:visible.nd:pb-2xl");
		// Element block =
		// web_data.selectFirst("shreddit-post.block.xs:mt-xs.xs:-mx-xs.xs:px-xs.xs:rounded-[16px].pt-xs.nd:pt-xs.bg-[color:var(--shreddit-content-background)].box-border.mb-xs.nd:visible.nd:pb-2xl");
		// String body = block.select("div.text-neutral-content.text-body").text();
		// System.out.println(body);
		// }
		@@ -208,6 +248,7 @@ public class crawler {
		List<String> chunks = splitTextIntoChunks(web_data.text());
		if (chunks.isEmpty()) {
		System.out.println("No text found on page.");
		outputCallback.accept("\nNo text found on page.");
		return;
		}
		// System.out.println(chunks.size());
		@@ -215,14 +256,21 @@ public class crawler {
		for (String chunk : chunks) {
		// chunkCount++;
		// System.out.println(chunk);
		// System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size());
		// System.out.println("Current compressed size: "+ compressedData.length+".
		// Processing chunk " + chunkCount + " of " + chunks.size());
		previousUncompressedData = uncompressedData.clone();
		extractWordUsage(chunk, wordUsage);
		uncompressedData = wordUsage.serialize();
		compressedData = compress(uncompressedData);
		if ((compressedData.length - compression_size > 1024)) {
		System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "+ (compressedData.length - compression_size) + " bytes.");
		System.out.println("Previous compressed data size: " + compression_size
		+ " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "
		+ (compressedData.length - compression_size) + " bytes.");
		outputCallback.accept("\nPrevious compressed data size: " + compression_size
		+ " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "
		+ (compressedData.length - compression_size) + " bytes.");
		System.out.println("Size limit exceeded. Reverting to previous chunk.");
		outputCallback.accept("\nSize limit exceeded. Reverting to previous chunk.");
		sizeLimitExceeded = true;
		uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
		compressedData = compress(uncompressedData); // Recompress the reverted state
		@@ -247,21 +295,27 @@ public class crawler {
		writeToFile(compressedData, filePath);
		if (!sizeLimitExceeded) {
		System.out.println("Compressed tree exported successfully to: " + filePath);
		outputCallback.accept("\nCompressed tree exported successfully to: " + filePath);
		} else {
		System.out.println("Compressed data truncated due to size limit.");
		}
		endTime = System.nanoTime();

		//System.out.println((endTime - startTime)/1000000000.0); // Total time taken to complete processing
		// System.out.println((endTime - startTime)/1000000000.0); // Total time taken
		// to complete processing

		// Output sizes of both compressed and uncompressed data for reference
		System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
		outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes\n");
		System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
		outputCallback.accept("\nUncompressed metadata size: " + uncompressedData.length + " bytes");
		// Output rate of processing
		double processingRate = web_data.text().length() / ((endTime - startTime) / 1000000000.0);
		System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
		outputCallback.accept("\nRate of processing: " + Math.round(processingRate) + " bytes/second");
		// Output # of links found in page
		System.out.println("# of additional links found: " + foundLinks + "\n");
		outputCallback.accept("\n# of additional links found: " + foundLinks + "\n");

		compression_size = compressedData.length;
		}
		@@ -336,19 +390,20 @@ public class crawler {
		byte[] compressedData = fis.readAllBytes();
		byte[] decompressedData = decompress(compressedData);
		compression_size = compressedData.length;
		// System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
		// System.out.println("Decompressed metadata size: " + decompressedData.length + " bytes");
		// System.out.println("Compressed metadata size: " + compressedData.length + "
		// bytes");
		// System.out.println("Decompressed metadata size: " + decompressedData.length +
		// " bytes");
		trie.deserialize(decompressedData);
		System.out.println("Metadata loaded successfully.");
		return trie;
		} catch (IOException e) {
		System.err.println("Error reading metadata from file: " + e.getMessage());
		// System.err.println("Error reading metadata from file: " + e.getMessage());
		System.err.println("Creating new trie...");
		return new TrieNode();
		}
		}


		private static void writeToFile(byte[] compressedData, String filePath) {
		try (FileOutputStream fos = new FileOutputStream(filePath)) {
		fos.write(compressedData);
		@@ -356,6 +411,7 @@ public class crawler {
		System.err.println("Error wr iting metadata to file: " + e.getMessage());
		}
		}

		// Output visited URLs as specified
		public void get_visited() {
		System.out.println("All of the visited websites:");