Commit 3aa55921 authored by Moises Bensadon's avatar Moises Bensadon
Browse files

Fix URL stripping of anchor fragments and query parameters

parent a077e7d1
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -146,7 +146,7 @@ public class MainApp extends JFrame {
                List<String> lines = Files.readAllLines(Paths.get(input));
                lines.forEach(webCrawler::add_to_queue);
            } catch (Exception e) {
                resultArea.setText("Error reading file: " + e.getMessage());
                resultArea.setText("Error reading file/link. Please make sure to include http or .txt for link or file respectively: " + e.getMessage());
                return;
            }
        }
@@ -324,7 +324,7 @@ public class MainApp extends JFrame {
            }

            resultArea.setText(result.toString());

            // ISSUE #30 - Feedback for corrector
            // Add a pop up to input the best correction for each sentence
            TrieNode node = corrector.getDetector();
            boolean changeMade = false;
+27 −17
Original line number Diff line number Diff line
@@ -10,6 +10,9 @@ import org.jsoup.select.Elements;
import java.util.*;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import javax.swing.JProgressBar;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
@@ -56,9 +59,7 @@ public class crawler {
        }
        if (!file_url.isEmpty()) {
            if (web_crawler.is_username) {
                String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user
                                                                                   // page
                // System.out.println(file_url);
                String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user page
                web_crawler.add_to_queue(user_url);
            } else {
                try (FileReader f_read = new FileReader(file_url);
@@ -72,7 +73,7 @@ public class crawler {
        }

        // Start crawling
        final int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        final int crawlLimit = 5; // Adjustable crawl limit (raised from 1; lower for quick testing)
        web_crawler.crawl(crawlLimit);

        // Print url queue
@@ -98,6 +99,7 @@ public class crawler {
    private boolean is_username = false; // flag for provided (reddit) username
    private static final int MAXNGRAM = 3;
    private Consumer<String> outputCallback;
    private JProgressBar progressBar;

    public crawler(String file) {
        url_queue = new LinkedList<>();
@@ -111,8 +113,9 @@ public class crawler {
        // Estimate page count based on compressed file size
    }

    public crawler(String file, Consumer<String> outputCallback) {
    public crawler(String file, Consumer<String> outputCallback, JProgressBar progressBar) {
        this.outputCallback = outputCallback;
        this.progressBar = progressBar;
        url_queue = new LinkedList<>();
        visited_urls = new HashSet<>();

@@ -139,7 +142,7 @@ public class crawler {
            }
            try {
                System.out.println("Processing: " + cur_site);
                outputCallback.accept("\nProcessing: " + cur_site);
                outputCallback.accept("\n\nProcessing: " + cur_site);
                Document web_data = get_web_data(cur_site);
                if (web_data != null) {
                    processPage(web_data, isPost);
@@ -156,7 +159,9 @@ public class crawler {

    public void build(String language) {
        String corpus = "";
        if (language.equals("English")) {
        if (language.equals("SmallEnglish")) {
            corpus = "brownSmall.txt";
        } else if (language.equals("English")) {
            corpus = "brown.txt";
        } else if (language.equals("German")) {
            corpus = "germanSmall.txt";
@@ -226,9 +231,14 @@ public class crawler {
            is_username = false;
            Elements posts = web_data.select("shreddit-profile-comment[href]"); // Get posts from user profile
            // Add all posts in overview to url_queue
            for (Element link : posts) {
            for (Element link : posts) { // for links in the text
                String link_url = link.attr("href");
                if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                // Strip the URL of any anchor tags or query tags
                if (link_url.contains("#"))
                    link_url = link_url.split("#")[0];
                if (link_url.contains("?"))
                    link_url = link_url.split("\\?")[0]; // ? is a special character in regex
                if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                    foundLinks++;
                    url_queue.add(link_url);
                }
@@ -237,8 +247,13 @@ public class crawler {
            Elements links = web_data.select("a[href]");
            for (Element link : links) {
                String link_url = link.attr("abs:href");
                // Strip the URL of any anchor tags
                if (link_url.contains("#"))
                    link_url = link_url.split("#")[0];
                if (link_url.contains("?"))
                    link_url = link_url.split("\\?")[0]; // ? is a special character in regex
                // System.out.println("Found link: " + link_url);
                if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                    // System.out.println("Adding link to queue: " + link_url);
                    foundLinks++;
                    url_queue.add(link_url);
@@ -259,7 +274,7 @@ public class crawler {
            // Break the page text into manageable chunks, considering sentences
            List<String> chunks = splitTextIntoChunks(web_data.text());
            if (chunks.isEmpty()) {
                System.out.println("No text found on page.");
                // System.out.println("No text found on page.");
                outputCallback.accept("\nNo text found on page.");
                return;
            }
@@ -344,17 +359,12 @@ public class crawler {

        // Output sizes of both compressed and uncompressed data for reference
        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes\n");
        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
        outputCallback.accept("\nUncompressed metadata size: " + uncompressedData.length + " bytes");
        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes, Uncompressed metadata size: " + uncompressedData.length + " bytes");
        // Output rate of processing
        double processingRate = web_data.text().length() / ((endTime - startTime) / 1000000000.0);
        System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
        outputCallback.accept("\nRate of processing: " + Math.round(processingRate) + " bytes/second");
        // Output # of links found in page
        System.out.println("# of additional links found: " + foundLinks + "\n");
        outputCallback.accept("\n# of additional links found: " + foundLinks + "\n");

        compression_size = compressedData.length;
    }