Loading checker_test_file.txt +2 −2 Original line number Diff line number Diff line Sorry for the terrible inconvenience but this site is still under super development. be right back. Sorry for Apple Dog Hello World the terrible so terrible inconvenience under super development. No newline at end of file This is unambiguous. This is word odd so choice. Sorry for the inconvenience. Kenya, officially the Republic of Kenya No newline at end of file crawler_test_file.txt +100 −1 Original line number Diff line number Diff line https://en.wikipedia.org/wiki/Web_crawler https://en.wikipedia.org/wiki/Wikipedia:Vital_articles No newline at end of file https://en.wikipedia.org/wiki/Main_Page https://en.wikipedia.org/wiki/Alliteration https://en.wikipedia.org/wiki/Pun https://en.wikipedia.org/wiki/Encyclop%C3%A6dia_Britannica https://en.wikipedia.org/wiki/Cosmic_latte https://en.wikipedia.org/wiki/Death_from_laughter https://en.wikipedia.org/wiki/Bir_Tawil https://en.wikipedia.org/wiki/M%C3%A4rket https://en.wikipedia.org/wiki/Phineas_Gage https://en.wikipedia.org/wiki/Snow_in_Florida https://en.wikipedia.org/wiki/Cult_following https://en.wikipedia.org/wiki/Ampelm%C3%A4nnchen https://en.wikipedia.org/wiki/All_your_base_are_belong_to_us https://en.wikipedia.org/wiki/Hoax https://en.wikipedia.org/wiki/Sokal_affair https://en.wikipedia.org/wiki/Mary_Toft https://en.wikipedia.org/wiki/Breast-shaped_hill https://en.wikipedia.org/wiki/Folly https://en.wikipedia.org/wiki/Manchester https://en.wikipedia.org/wiki/Eiffel_Tower_replicas_and_derivatives https://en.wikipedia.org/wiki/Gravity_hill https://en.wikipedia.org/wiki/List_of_cities_claimed_to_be_built_on_seven_hills https://en.wikipedia.org/wiki/Seven_hills_of_Rome https://en.wikipedia.org/wiki/List_of_micronations https://en.wikipedia.org/wiki/List_of_tautological_place_names https://en.wikipedia.org/wiki/Truism https://en.wikipedia.org/wiki/Pizza_farm https://en.wikipedia.org/wiki/Recursive_islands_and_lakes 
https://en.wikipedia.org/wiki/Rocket_garden https://en.wikipedia.org/wiki/Spite_house https://en.wikipedia.org/wiki/Valeriepieris_circle https://en.wikipedia.org/wiki/Lake_Nyos https://en.wikipedia.org/wiki/Pe%C3%B1%C3%B3n_de_V%C3%A9lez_de_la_Gomera https://en.wikipedia.org/wiki/Senegal https://en.wikipedia.org/wiki/Akon_City https://en.wikipedia.org/wiki/Akon https://en.wikipedia.org/wiki/Cryptocurrency https://en.wikipedia.org/wiki/Egypt https://en.wikipedia.org/wiki/Sudan https://en.wikipedia.org/wiki/Blue_Desert https://en.wikipedia.org/wiki/Egypt%E2%80%93Israel_peace_treaty https://en.wikipedia.org/wiki/United_Nations https://en.wikipedia.org/wiki/Sinai_Desert https://en.wikipedia.org/wiki/South_Africa https://en.wikipedia.org/wiki/Boulders_Beach https://en.wikipedia.org/wiki/Democratic_Republic_of_the_Congo https://en.wikipedia.org/wiki/Congo_Pedicle https://en.wikipedia.org/wiki/Ethiopia https://en.wikipedia.org/wiki/Dallol_(hydrothermal_system) https://en.wikipedia.org/wiki/Dallol_(ghost_town) https://en.wikipedia.org/wiki/Gaet%27ale_Pond https://en.wikipedia.org/wiki/Saint_Helena https://en.wikipedia.org/wiki/Jacob%27s_Ladder_(Saint_Helena) https://en.wikipedia.org/wiki/Nigeria https://en.wikipedia.org/wiki/Kalakuta_Republic https://en.wikipedia.org/wiki/Fela_Kuti https://en.wikipedia.org/wiki/Supreme_Military_Council_of_Nigeria_(1966%E2%80%931979) https://en.wikipedia.org/wiki/Cameroon https://en.wikipedia.org/wiki/Lake_Nyos_disaster https://en.wikipedia.org/wiki/Lake_Monoun https://en.wikipedia.org/wiki/Mauritania https://en.wikipedia.org/wiki/Mauritania_Railway https://en.wikipedia.org/wiki/Nouadhibou https://en.wikipedia.org/wiki/Iron_ore https://en.wikipedia.org/wiki/Gabon https://en.wikipedia.org/wiki/Oklo_Mine https://en.wikipedia.org/wiki/Angola https://en.wikipedia.org/wiki/Pal%C3%A1cio_de_Ferro https://en.wikipedia.org/wiki/Luanda https://en.wikipedia.org/wiki/Gustave_Eiffel https://en.wikipedia.org/wiki/Eiffel_Tower 
https://en.wikipedia.org/wiki/Spain https://en.wikipedia.org/wiki/Morocco https://en.wikipedia.org/wiki/Tombolo https://en.wikipedia.org/wiki/2012_Pe%C3%B1%C3%B3n_de_V%C3%A9lez_de_la_Gomera_incident https://en.wikipedia.org/wiki/Republic_of_Benin_(1967) https://en.wikipedia.org/wiki/Yemen https://en.wikipedia.org/wiki/Socotra https://en.wikipedia.org/wiki/Dracaena_cinnabari https://en.wikipedia.org/wiki/Dendrosicyos https://en.wikipedia.org/wiki/Ghana https://en.wikipedia.org/wiki/La_Tante_DC10_Restaurant https://en.wikipedia.org/wiki/McDonnell_Douglas_DC-10 https://en.wikipedia.org/wiki/Accra https://en.wikipedia.org/wiki/French_Southern_and_Antarctic_Lands https://en.wikipedia.org/wiki/Tromelin_Island https://en.wikipedia.org/wiki/The_Owl_House_(museum) https://en.wikipedia.org/wiki/The_Owl_House https://en.wikipedia.org/wiki/Kenya https://en.wikipedia.org/wiki/Umoja,_Kenya https://en.wikipedia.org/wiki/Violence_against_women https://en.wikipedia.org/wiki/Samburu_people https://en.wikipedia.org/wiki/Blood_Falls https://en.wikipedia.org/wiki/Ross_Dependency https://en.wikipedia.org/wiki/Iron_oxide https://en.wikipedia.org/wiki/Heard_Island_and_McDonald_Islands https://en.wikipedia.org/wiki/Mawson_Peak https://en.wikipedia.org/wiki/Australia https://en.wikipedia.org/wiki/McMurdo_Dry_Valleys https://en.wikipedia.org/wiki/Don_Juan_Pond No newline at end of file getLinks.py 0 → 100644 +23 −0 Original line number Diff line number Diff line import requests from bs4 import BeautifulSoup def get_wikipedia_article_links(url): response = requests.get(url) soup = BeautifulSoup(response.text, 'html.parser') links = [] for link in soup.find_all('a', href=True): if '/wiki/' in link['href'] and ':' not in link['href']: full_link = f"https://en.wikipedia.org{link['href']}" if full_link not in links: links.append(full_link) return links # Example: Fetching links from the Unusual Articles page unusual_articles_url = 'https://en.wikipedia.org/wiki/Wikipedia:Unusual_articles' 
unusual_links = get_wikipedia_article_links(unusual_articles_url) # Print the first 10 links as a sample for link in unusual_links[:100]: print(link) src/main/java/edu/bu/LanguageCorrection/crawler.java +6 −4 Original line number Diff line number Diff line Loading @@ -41,7 +41,7 @@ public class crawler { } // Start crawling int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE) int crawlLimit = 100; // Adjustable limit (SET TO 1 FOR EASE OF USE) web_crawler.crawl(crawlLimit); // Print visited URLs Loading Loading @@ -85,7 +85,9 @@ public class crawler { Elements links = web_data.select("a[href]"); for (Element link : links) { String link_url = link.attr("abs:href"); if (!link_url.isEmpty() && !visited_urls.contains(link_url)) { System.out.println("Found link: " + link_url); if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) { System.out.println("Adding link to queue: " + link_url); url_queue.add(link_url); } } Loading Loading @@ -122,13 +124,13 @@ public class crawler { // Break the page text into manageable chunks, considering sentences List<String> chunks = splitTextIntoChunks(web_data.text()); int chunkCount = 0; // int chunkCount = 0; if (chunks.isEmpty()) { System.out.println("No text found on page."); return; } for (String chunk : chunks) { chunkCount++; // chunkCount++; // System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size()); previousUncompressedData = uncompressedData.clone(); extractWordUsage(chunk, wordUsage); Loading Loading
checker_test_file.txt +2 −2 Original line number Diff line number Diff line Sorry for the terrible inconvenience but this site is still under super development. be right back. Sorry for Apple Dog Hello World the terrible so terrible inconvenience under super development. No newline at end of file This is unambiguous. This is word odd so choice. Sorry for the inconvenience. Kenya, officially the Republic of Kenya No newline at end of file
crawler_test_file.txt +100 −1 Original line number Diff line number Diff line https://en.wikipedia.org/wiki/Web_crawler https://en.wikipedia.org/wiki/Wikipedia:Vital_articles No newline at end of file https://en.wikipedia.org/wiki/Main_Page https://en.wikipedia.org/wiki/Alliteration https://en.wikipedia.org/wiki/Pun https://en.wikipedia.org/wiki/Encyclop%C3%A6dia_Britannica https://en.wikipedia.org/wiki/Cosmic_latte https://en.wikipedia.org/wiki/Death_from_laughter https://en.wikipedia.org/wiki/Bir_Tawil https://en.wikipedia.org/wiki/M%C3%A4rket https://en.wikipedia.org/wiki/Phineas_Gage https://en.wikipedia.org/wiki/Snow_in_Florida https://en.wikipedia.org/wiki/Cult_following https://en.wikipedia.org/wiki/Ampelm%C3%A4nnchen https://en.wikipedia.org/wiki/All_your_base_are_belong_to_us https://en.wikipedia.org/wiki/Hoax https://en.wikipedia.org/wiki/Sokal_affair https://en.wikipedia.org/wiki/Mary_Toft https://en.wikipedia.org/wiki/Breast-shaped_hill https://en.wikipedia.org/wiki/Folly https://en.wikipedia.org/wiki/Manchester https://en.wikipedia.org/wiki/Eiffel_Tower_replicas_and_derivatives https://en.wikipedia.org/wiki/Gravity_hill https://en.wikipedia.org/wiki/List_of_cities_claimed_to_be_built_on_seven_hills https://en.wikipedia.org/wiki/Seven_hills_of_Rome https://en.wikipedia.org/wiki/List_of_micronations https://en.wikipedia.org/wiki/List_of_tautological_place_names https://en.wikipedia.org/wiki/Truism https://en.wikipedia.org/wiki/Pizza_farm https://en.wikipedia.org/wiki/Recursive_islands_and_lakes https://en.wikipedia.org/wiki/Rocket_garden https://en.wikipedia.org/wiki/Spite_house https://en.wikipedia.org/wiki/Valeriepieris_circle https://en.wikipedia.org/wiki/Lake_Nyos https://en.wikipedia.org/wiki/Pe%C3%B1%C3%B3n_de_V%C3%A9lez_de_la_Gomera https://en.wikipedia.org/wiki/Senegal https://en.wikipedia.org/wiki/Akon_City https://en.wikipedia.org/wiki/Akon https://en.wikipedia.org/wiki/Cryptocurrency https://en.wikipedia.org/wiki/Egypt 
https://en.wikipedia.org/wiki/Sudan https://en.wikipedia.org/wiki/Blue_Desert https://en.wikipedia.org/wiki/Egypt%E2%80%93Israel_peace_treaty https://en.wikipedia.org/wiki/United_Nations https://en.wikipedia.org/wiki/Sinai_Desert https://en.wikipedia.org/wiki/South_Africa https://en.wikipedia.org/wiki/Boulders_Beach https://en.wikipedia.org/wiki/Democratic_Republic_of_the_Congo https://en.wikipedia.org/wiki/Congo_Pedicle https://en.wikipedia.org/wiki/Ethiopia https://en.wikipedia.org/wiki/Dallol_(hydrothermal_system) https://en.wikipedia.org/wiki/Dallol_(ghost_town) https://en.wikipedia.org/wiki/Gaet%27ale_Pond https://en.wikipedia.org/wiki/Saint_Helena https://en.wikipedia.org/wiki/Jacob%27s_Ladder_(Saint_Helena) https://en.wikipedia.org/wiki/Nigeria https://en.wikipedia.org/wiki/Kalakuta_Republic https://en.wikipedia.org/wiki/Fela_Kuti https://en.wikipedia.org/wiki/Supreme_Military_Council_of_Nigeria_(1966%E2%80%931979) https://en.wikipedia.org/wiki/Cameroon https://en.wikipedia.org/wiki/Lake_Nyos_disaster https://en.wikipedia.org/wiki/Lake_Monoun https://en.wikipedia.org/wiki/Mauritania https://en.wikipedia.org/wiki/Mauritania_Railway https://en.wikipedia.org/wiki/Nouadhibou https://en.wikipedia.org/wiki/Iron_ore https://en.wikipedia.org/wiki/Gabon https://en.wikipedia.org/wiki/Oklo_Mine https://en.wikipedia.org/wiki/Angola https://en.wikipedia.org/wiki/Pal%C3%A1cio_de_Ferro https://en.wikipedia.org/wiki/Luanda https://en.wikipedia.org/wiki/Gustave_Eiffel https://en.wikipedia.org/wiki/Eiffel_Tower https://en.wikipedia.org/wiki/Spain https://en.wikipedia.org/wiki/Morocco https://en.wikipedia.org/wiki/Tombolo https://en.wikipedia.org/wiki/2012_Pe%C3%B1%C3%B3n_de_V%C3%A9lez_de_la_Gomera_incident https://en.wikipedia.org/wiki/Republic_of_Benin_(1967) https://en.wikipedia.org/wiki/Yemen https://en.wikipedia.org/wiki/Socotra https://en.wikipedia.org/wiki/Dracaena_cinnabari https://en.wikipedia.org/wiki/Dendrosicyos https://en.wikipedia.org/wiki/Ghana 
https://en.wikipedia.org/wiki/La_Tante_DC10_Restaurant https://en.wikipedia.org/wiki/McDonnell_Douglas_DC-10 https://en.wikipedia.org/wiki/Accra https://en.wikipedia.org/wiki/French_Southern_and_Antarctic_Lands https://en.wikipedia.org/wiki/Tromelin_Island https://en.wikipedia.org/wiki/The_Owl_House_(museum) https://en.wikipedia.org/wiki/The_Owl_House https://en.wikipedia.org/wiki/Kenya https://en.wikipedia.org/wiki/Umoja,_Kenya https://en.wikipedia.org/wiki/Violence_against_women https://en.wikipedia.org/wiki/Samburu_people https://en.wikipedia.org/wiki/Blood_Falls https://en.wikipedia.org/wiki/Ross_Dependency https://en.wikipedia.org/wiki/Iron_oxide https://en.wikipedia.org/wiki/Heard_Island_and_McDonald_Islands https://en.wikipedia.org/wiki/Mawson_Peak https://en.wikipedia.org/wiki/Australia https://en.wikipedia.org/wiki/McMurdo_Dry_Valleys https://en.wikipedia.org/wiki/Don_Juan_Pond No newline at end of file
# getLinks.py -- added as a new file in this change (diff header: 0 → 100644, +23 −0).
from urllib.parse import urldefrag

import requests
from bs4 import BeautifulSoup

# Host that site-relative "/wiki/..." hrefs are resolved against.
_WIKIPEDIA_BASE_URL = "https://en.wikipedia.org"


def get_wikipedia_article_links(url, timeout=10):
    """Collect the unique article links found on a Wikipedia page.

    The page at ``url`` is downloaded and every ``<a href=...>`` is inspected.
    Only site-relative article paths (hrefs that start with ``/wiki/`` and
    contain no ``:``, which excludes namespaced pages such as ``File:`` or
    ``Wikipedia:``) are kept. Each kept path is turned into an absolute
    ``https://en.wikipedia.org`` URL.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of absolute article URLs, in order of first appearance,
        without duplicates and without ``#fragment`` suffixes.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests.get() may block forever on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping links out of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    seen = set()  # O(1) duplicate check; `links` keeps first-seen order
    for anchor in soup.find_all('a', href=True):
        # "/wiki/Foo#Bar" and "/wiki/Foo" are the same page: drop the fragment.
        path = urldefrag(anchor['href']).url
        # Require the href to START with /wiki/ so protocol-relative links to
        # other hosts ("//de.wikipedia.org/wiki/X") are not glued onto the
        # English base URL.
        if not path.startswith('/wiki/') or ':' in path:
            continue
        full_link = f"{_WIKIPEDIA_BASE_URL}{path}"
        if full_link not in seen:
            seen.add(full_link)
            links.append(full_link)
    return links


# Example: Fetching links from the Unusual Articles page
unusual_articles_url = 'https://en.wikipedia.org/wiki/Wikipedia:Unusual_articles'
unusual_links = get_wikipedia_article_links(unusual_articles_url)

# Print the first 100 links as a sample
for link in unusual_links[:100]:
    print(link)
src/main/java/edu/bu/LanguageCorrection/crawler.java +6 −4 Original line number Diff line number Diff line Loading @@ -41,7 +41,7 @@ public class crawler { } // Start crawling int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE) int crawlLimit = 100; // Adjustable limit (SET TO 1 FOR EASE OF USE) web_crawler.crawl(crawlLimit); // Print visited URLs Loading Loading @@ -85,7 +85,9 @@ public class crawler { Elements links = web_data.select("a[href]"); for (Element link : links) { String link_url = link.attr("abs:href"); if (!link_url.isEmpty() && !visited_urls.contains(link_url)) { System.out.println("Found link: " + link_url); if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) { System.out.println("Adding link to queue: " + link_url); url_queue.add(link_url); } } Loading Loading @@ -122,13 +124,13 @@ public class crawler { // Break the page text into manageable chunks, considering sentences List<String> chunks = splitTextIntoChunks(web_data.text()); int chunkCount = 0; // int chunkCount = 0; if (chunks.isEmpty()) { System.out.println("No text found on page."); return; } for (String chunk : chunks) { chunkCount++; // chunkCount++; // System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size()); previousUncompressedData = uncompressedData.clone(); extractWordUsage(chunk, wordUsage); Loading