Loading .gitignore +2 −1 Original line number Diff line number Diff line Loading @@ -3,3 +3,4 @@ target/ .vscode/ .idea/ *.json No newline at end of file crawler_test_file.txt +1 −22 Original line number Diff line number Diff line https://en.wikipedia.org/wiki/Web_crawler https://link.springer.com/book/10.1007/978-3-540-46332-0 http://www.youtube.com http://www.facebook.com http://www.yahoo.com http://www.amazon.com http://www.wikipedia.org http://www.twitter.com http://www.live.com http://www.bing.com http://www.instagram.com http://www.linkedin.com http://www.msn.com http://www.vk.com http://www.hao123.com http://www.reddit.com http://www.ebay.com http://www.t.co http://www.tmall.com http://www.sohu.com http://www.pinterest.com http://www.netflix.com http://www.microsoft.com No newline at end of file https://en.wikipedia.org/wiki/Wikipedia:Vital_articles No newline at end of file src/main/java/edu/bu/LanguageCorrection/TrieNode.java +8 −4 Original line number Diff line number Diff line Loading @@ -41,17 +41,21 @@ public class TrieNode implements Serializable, Cloneable { TrieNode current = this; TrieNode past = this; float logProb = 0; for (String word : phrase.split(" ")) { String[] words = phrase.split(" "); if (words.length == 1) { return 1; } for (String word : words) { past = current; current = current.children.get(word); if (current == null) { // System.out.println("Phrase not found in trie."); return Float.MAX_VALUE; float alpha = 2; return alpha * perplexity(phrase.replaceFirst(words[0] + " ", "")); } logProb += Math.log((float) current.count / past.childCounts); } float perplexity = (float) Math.pow(2, -logProb); // System.out.println("Perplexity of phrase: " + perplexity); //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity); return perplexity; } Loading src/main/java/edu/bu/LanguageCorrection/crawler.java +7 −3 Original line number Diff line number Diff line Loading @@ -41,7 +41,7 @@ public class crawler { } // Start crawling int crawlLimit = 2; // Adjustable limit (SET TO 1 FOR EASE OF USE) int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE) web_crawler.crawl(crawlLimit); // Print visited URLs Loading Loading @@ -122,9 +122,13 @@ public class crawler { // Break the page text into manageable chunks, considering sentences List<String> chunks = splitTextIntoChunks(web_data.text()); // int chunkCount = 0; int chunkCount = 0; if (chunks.isEmpty()) { System.out.println("No text found on page."); return; } for (String chunk : chunks) { // chunkCount++; chunkCount++; // System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size()); previousUncompressedData = uncompressedData.clone(); extractWordUsage(chunk, wordUsage); Loading Loading
.gitignore +2 −1 Original line number Diff line number Diff line Loading @@ -3,3 +3,4 @@ target/ .vscode/ .idea/ *.json No newline at end of file
crawler_test_file.txt +1 −22 Original line number Diff line number Diff line https://en.wikipedia.org/wiki/Web_crawler https://link.springer.com/book/10.1007/978-3-540-46332-0 http://www.youtube.com http://www.facebook.com http://www.yahoo.com http://www.amazon.com http://www.wikipedia.org http://www.twitter.com http://www.live.com http://www.bing.com http://www.instagram.com http://www.linkedin.com http://www.msn.com http://www.vk.com http://www.hao123.com http://www.reddit.com http://www.ebay.com http://www.t.co http://www.tmall.com http://www.sohu.com http://www.pinterest.com http://www.netflix.com http://www.microsoft.com No newline at end of file https://en.wikipedia.org/wiki/Wikipedia:Vital_articles No newline at end of file
src/main/java/edu/bu/LanguageCorrection/TrieNode.java +8 −4 Original line number Diff line number Diff line Loading @@ -41,17 +41,21 @@ public class TrieNode implements Serializable, Cloneable { TrieNode current = this; TrieNode past = this; float logProb = 0; for (String word : phrase.split(" ")) { String[] words = phrase.split(" "); if (words.length == 1) { return 1; } for (String word : words) { past = current; current = current.children.get(word); if (current == null) { // System.out.println("Phrase not found in trie."); return Float.MAX_VALUE; float alpha = 2; return alpha * perplexity(phrase.replaceFirst(words[0] + " ", "")); } logProb += Math.log((float) current.count / past.childCounts); } float perplexity = (float) Math.pow(2, -logProb); // System.out.println("Perplexity of phrase: " + perplexity); //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity); return perplexity; } Loading
src/main/java/edu/bu/LanguageCorrection/crawler.java +7 −3 Original line number Diff line number Diff line Loading @@ -41,7 +41,7 @@ public class crawler { } // Start crawling int crawlLimit = 2; // Adjustable limit (SET TO 1 FOR EASE OF USE) int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE) web_crawler.crawl(crawlLimit); // Print visited URLs Loading Loading @@ -122,9 +122,13 @@ public class crawler { // Break the page text into manageable chunks, considering sentences List<String> chunks = splitTextIntoChunks(web_data.text()); // int chunkCount = 0; int chunkCount = 0; if (chunks.isEmpty()) { System.out.println("No text found on page."); return; } for (String chunk : chunks) { // chunkCount++; chunkCount++; // System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size()); previousUncompressedData = uncompressedData.clone(); extractWordUsage(chunk, wordUsage); Loading