src/main/java/edu/bu/LanguageCorrection/MainApp.java  (+2 −2)

@@ -146,7 +146,7 @@ public class MainApp extends JFrame {
         List<String> lines = Files.readAllLines(Paths.get(input));
         lines.forEach(webCrawler::add_to_queue);
     } catch (Exception e) {
-        resultArea.setText("Error reading file: " + e.getMessage());
+        resultArea.setText("Error reading file/link. Please make sure to include http or .txt for link or file respectively: " + e.getMessage());
         return;
     }
 }

@@ -324,7 +324,7 @@ public class MainApp extends JFrame {
     }
     resultArea.setText(result.toString());

-    // ISSUE #30 - Feedback for corrector
+    // Add a pop up to input the best correction for each sentence
    TrieNode node = corrector.getDetector();
    boolean changeMade = false;
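The new error message tells the user that the input may be either an http link or a .txt file of URLs. Only the catch block appears in this diff, so the following dispatch is a hedged sketch of the branching it implies; `enqueueInput` and the local queue are illustrative stand-ins for MainApp's fields, not code from the PR:

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

// Hedged sketch of the link-vs-file dispatch implied by the new error text.
// The branch structure is an assumption; only the catch block is shown in the diff.
public final class InputDispatchSketch {
    static void enqueueInput(String input, Queue<String> urlQueue) throws Exception {
        if (input.startsWith("http")) {
            urlQueue.add(input);                      // a single link goes straight into the queue
        } else if (input.endsWith(".txt")) {
            List<String> lines = Files.readAllLines(Paths.get(input));
            lines.forEach(urlQueue::add);             // one URL per line, as in the diff
        } else {
            throw new IllegalArgumentException(
                    "Please make sure to include http or .txt for link or file respectively");
        }
    }

    public static void main(String[] args) throws Exception {
        Queue<String> queue = new LinkedList<>();
        enqueueInput("https://www.reddit.com/", queue);
        System.out.println(queue); // [https://www.reddit.com/]
    }
}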
src/main/java/edu/bu/LanguageCorrection/crawler.java  (+27 −17)

@@ -10,6 +10,9 @@
 import org.jsoup.select.Elements;

 import java.util.*;
 import java.util.zip.Deflater;
 import java.util.zip.Inflater;
+import javax.swing.JProgressBar;
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.FileInputStream;

@@ -56,9 +59,7 @@ public class crawler {
     }
     if (!file_url.isEmpty()) {
         if (web_crawler.is_username) {
-            String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user
-            // page
-            // System.out.println(file_url);
+            String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user page
             web_crawler.add_to_queue(user_url);
         } else {
             try (FileReader f_read = new FileReader(file_url);

@@ -72,7 +73,7 @@ public class crawler {
     }

     // Start crawling
-    final int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
+    final int crawlLimit = 5; // Adjustable limit (SET TO 1 FOR EASE OF USE)
     web_crawler.crawl(crawlLimit);

     // Print url queue

@@ -98,6 +99,7 @@ public class crawler {
     private boolean is_username = false; // flag for provided (reddit) username
     private static final int MAXNGRAM = 3;
     private Consumer<String> outputCallback;
+    private JProgressBar progressBar;

     public crawler(String file) {
         url_queue = new LinkedList<>();

@@ -111,8 +113,9 @@ public class crawler {
         // Estimate page count based on compressed file size
     }

-    public crawler(String file, Consumer<String> outputCallback) {
+    public crawler(String file, Consumer<String> outputCallback, JProgressBar progressBar) {
         this.outputCallback = outputCallback;
+        this.progressBar = progressBar;
         url_queue = new LinkedList<>();
         visited_urls = new HashSet<>();

@@ -139,7 +142,7 @@ public class crawler {
         }
         try {
             System.out.println("Processing: " + cur_site);
-            outputCallback.accept("\nProcessing: " + cur_site);
+            outputCallback.accept("\n\nProcessing: " + cur_site);
             Document web_data = get_web_data(cur_site);
             if (web_data != null) {
                 processPage(web_data, isPost);

@@ -156,7 +159,9 @@ public class crawler {
     public void build(String language) {
         String corpus = "";

-        if (language.equals("English")) {
+        if (language.equals("SmallEnglish")) {
+            corpus = "brownSmall.txt";
+        } else if (language.equals("English")) {
             corpus = "brown.txt";
         } else if (language.equals("German")) {
             corpus = "germanSmall.txt";

@@ -226,9 +231,14 @@ public class crawler {
         is_username = false;
         Elements posts = web_data.select("shreddit-profile-comment[href]"); // Get posts from user profile
         // Add all posts in overview to url_queue
-        for (Element link : posts) {
+        for (Element link : posts) { // for links in the text
             String link_url = link.attr("href");
-            if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
+            // Strip the URL of any anchor tags or query tags
+            if (link_url.contains("#")) link_url = link_url.split("#")[0];
+            if (link_url.contains("?")) link_url = link_url.split("\\?")[0]; // ? is a special character in regex
+            if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                 foundLinks++;
                 url_queue.add(link_url);
             }

@@ -237,8 +247,13 @@ public class crawler {
         Elements links = web_data.select("a[href]");
         for (Element link : links) {
             String link_url = link.attr("abs:href");
+            // Strip the URL of any anchor tags
+            if (link_url.contains("#")) link_url = link_url.split("#")[0];
+            if (link_url.contains("?")) link_url = link_url.split("\\?")[0]; // ? is a special character in regex
             // System.out.println("Found link: " + link_url);
-            if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
+            if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                 // System.out.println("Adding link to queue: " + link_url);
                 foundLinks++;
                 url_queue.add(link_url);

@@ -259,7 +274,7 @@ public class crawler {
         // Break the page text into manageable chunks, considering sentences
         List<String> chunks = splitTextIntoChunks(web_data.text());
         if (chunks.isEmpty()) {
-            System.out.println("No text found on page.");
+            // System.out.println("No text found on page.");
             outputCallback.accept("\nNo text found on page.");
             return;
         }

@@ -344,17 +359,12 @@ public class crawler {
         // Output sizes of both compressed and uncompressed data for reference
         System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
-        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes\n");
         System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
-        outputCallback.accept("\nUncompressed metadata size: " + uncompressedData.length + " bytes");
+        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes, Uncompressed metadata size: " + uncompressedData.length + " bytes");

         // Output rate of processing
         double processingRate = web_data.text().length() / ((endTime - startTime) / 1000000000.0);
         System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
-        outputCallback.accept("\nRate of processing: " + Math.round(processingRate) + " bytes/second");

         // Output # of links found in page
         System.out.println("# of additional links found: " + foundLinks + "\n");
-        outputCallback.accept("\n# of additional links found: " + foundLinks + "\n");

         compression_size = compressedData.length;
     }
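Both link-collection loops now strip fragments and query strings before checking the visited set, so the same page reached as page#comments or page?sort=new no longer slips past deduplication. A standalone sketch of exactly that normalization; the helper name is illustrative, not from the PR:

import java.util.Objects;

// Minimal sketch of the anchor/query stripping the diff adds in processPage.
public final class UrlNormalizerSketch {
    static String normalizeUrl(String url) {
        Objects.requireNonNull(url);
        // Drop the fragment first: everything after '#' never reaches the server.
        if (url.contains("#")) url = url.split("#")[0];
        // '?' is a regex metacharacter, so it must be escaped for String.split.
        if (url.contains("?")) url = url.split("\\?")[0];
        return url;
    }

    public static void main(String[] args) {
        System.out.println(normalizeUrl("https://example.com/page?sort=new#comments"));
        // -> https://example.com/page
    }
}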
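For context, the crawl itself is a bounded walk over url_queue (a LinkedList per the constructor) guarded by visited_urls (a HashSet), with crawlLimit raised to 5 in this PR. The crawl method's body is not shown in the diff, so the following is a hedged sketch of that queue/visited-set discipline with fetching stubbed out; all names here are illustrative:

import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;

// Sketch of a limit-bounded crawl loop over a queue with a visited set.
public final class CrawlLoopSketch {
    public static void main(String[] args) {
        Deque<String> urlQueue = new LinkedList<>();
        Set<String> visited = new HashSet<>();
        urlQueue.add("https://www.reddit.com/user/someuser/"); // illustrative seed
        final int crawlLimit = 5; // matches the new value in the PR
        int processed = 0;
        while (!urlQueue.isEmpty() && processed < crawlLimit) {
            String site = urlQueue.poll();
            if (!visited.add(site)) continue; // Set.add returns false for already-seen URLs
            processed++;
            System.out.println("Processing: " + site);
            // get_web_data(site) and processPage(...) would run here,
            // appending newly discovered (normalized) links to urlQueue.
        }
    }
}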
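The second constructor now takes a JProgressBar alongside the output callback, but this diff never shows the bar being updated. One plausible wiring, assuming the crawl loop reports pages visited against the crawl limit; everything beyond the field itself is an assumption:

import javax.swing.JProgressBar;
import javax.swing.SwingUtilities;

// Hedged sketch: the PR only adds the progressBar field and constructor
// parameter; this helper is an assumption about how it might be driven.
public final class ProgressSketch {
    static void reportProgress(JProgressBar progressBar, int visited, int limit) {
        if (progressBar == null || limit <= 0) return; // the bar is optional
        int percent = (int) Math.round(100.0 * visited / limit);
        // Swing components should only be mutated on the Event Dispatch Thread.
        SwingUtilities.invokeLater(() -> progressBar.setValue(Math.min(percent, 100)));
    }

    public static void main(String[] args) {
        JProgressBar bar = new JProgressBar(0, 100);
        reportProgress(bar, 3, 5); // -> bar at 60%
    }
}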
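Finally, the three separate metadata messages collapse into one combined line through outputCallback. The sizes being reported presumably come from java.util.zip.Deflater, which crawler.java already imports; this self-contained sketch shows one standard way to obtain both numbers, with illustrative data and variable names:

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.Deflater;

// Hedged sketch of measuring compressed vs. uncompressed metadata size
// with java.util.zip.Deflater, as the combined output line reports.
public final class CompressionSizeSketch {
    public static void main(String[] args) {
        byte[] uncompressedData = "some page metadata ...".getBytes(StandardCharsets.UTF_8);
        Deflater deflater = new Deflater();
        deflater.setInput(uncompressedData);
        deflater.finish();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        while (!deflater.finished()) {
            out.write(buffer, 0, deflater.deflate(buffer)); // deflate returns bytes written
        }
        deflater.end();
        byte[] compressedData = out.toByteArray();
        System.out.println("Compressed metadata size: " + compressedData.length
                + " bytes, Uncompressed metadata size: " + uncompressedData.length + " bytes");
    }
}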