Loading Crawler/src/main/java/org/example/crawler.java +69 −10 Original line number Diff line number Diff line Loading @@ -19,8 +19,60 @@ import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Arrays; public class crawler { static class TrieNode { HashMap<String, TrieNode> children = new HashMap<>(); int count = 0; int childCounts = 0; public void insert(String[] phrase) { TrieNode current = this; TrieNode past = this; // Store the previous node for (String word : phrase) { past = current; current = current.children.computeIfAbsent(word, c -> new TrieNode()); } current.count += 1; past.childCounts += 1; } public float probability(String phrase) { TrieNode current = this; TrieNode past = this; for (String word : phrase.split(" ")) { past = current; current = current.children.get(word); if (current == null) { // System.out.println("Phrase not found in trie."); return 0; } } // System.out.println("Probability of phrase: " + (double) current.count / past.childCounts); return (float) current.count / past.childCounts; } public float perplexity(String phrase) { TrieNode current = this; TrieNode past = this; float logProb = 0; for (String word : phrase.split(" ")) { past = current; current = current.children.get(word); if (current == null) { // System.out.println("Phrase not found in trie."); return Float.MAX_VALUE; } logProb += Math.log((float) current.count / past.childCounts); } float perplexity = (float) Math.pow(2, -logProb); // System.out.println("Perplexity of phrase: " + perplexity); return perplexity; } } public static void main(String[] args) throws IOException { // Initialize web crawler crawler web_crawler = new crawler(); Loading @@ -47,6 +99,7 @@ public class crawler { //members private final LinkedList<String> url_queue; private final HashSet<String> visited_urls; private final TrieNode wordUsage = new TrieNode(); public crawler() { url_queue = new LinkedList<>(); Loading Loading @@ -108,7 +161,7 @@ public class crawler { */ // Extract word usage data Map<String, Integer> wordUsage = extractWordUsage(web_data.text()); extractWordUsage(web_data.text(), wordUsage); String metadata = serializeWordUsage(wordUsage); try { Loading @@ -131,19 +184,25 @@ public class crawler { // STORE COMPRESSED DATA // } private Map<String, Integer> extractWordUsage(String text) { Map<String, Integer> wordCount = new HashMap<>(); // Split by whitespace and count occurrences for (String word : text.split("\\s+")) { wordCount.put(word, wordCount.getOrDefault(word, 0) + 1); private void extractWordUsage(String text, TrieNode trie) { // Split text into sentences String[] sentences = text.split("[.!?]"); for (String sentence : sentences) { for (int nGram = 1; nGram <= 2; nGram++) { String[] words = sentence.split("\\s+"); for (int i = 0; i < words.length - nGram + 1; i++) { trie.insert(Arrays.copyOfRange(words, i, i + nGram)); } return wordCount; } } } private String serializeWordUsage(Map<String, Integer> wordUsage) { private String serializeWordUsage(TrieNode wordUsage) { // Convert word usage to string StringBuilder builder = new StringBuilder(); for (Map.Entry<String, Integer> entry : wordUsage.entrySet()) { for (Map.Entry<String, TrieNode> entry : wordUsage.children.entrySet()) { builder.append(entry.getKey()).append(":").append(entry.getValue()).append(";"); } return builder.toString(); Loading Loading
Crawler/src/main/java/org/example/crawler.java +69 −10 Original line number Diff line number Diff line Loading @@ -19,8 +19,60 @@ import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Arrays; public class crawler { static class TrieNode { HashMap<String, TrieNode> children = new HashMap<>(); int count = 0; int childCounts = 0; public void insert(String[] phrase) { TrieNode current = this; TrieNode past = this; // Store the previous node for (String word : phrase) { past = current; current = current.children.computeIfAbsent(word, c -> new TrieNode()); } current.count += 1; past.childCounts += 1; } public float probability(String phrase) { TrieNode current = this; TrieNode past = this; for (String word : phrase.split(" ")) { past = current; current = current.children.get(word); if (current == null) { // System.out.println("Phrase not found in trie."); return 0; } } // System.out.println("Probability of phrase: " + (double) current.count / past.childCounts); return (float) current.count / past.childCounts; } public float perplexity(String phrase) { TrieNode current = this; TrieNode past = this; float logProb = 0; for (String word : phrase.split(" ")) { past = current; current = current.children.get(word); if (current == null) { // System.out.println("Phrase not found in trie."); return Float.MAX_VALUE; } logProb += Math.log((float) current.count / past.childCounts); } float perplexity = (float) Math.pow(2, -logProb); // System.out.println("Perplexity of phrase: " + perplexity); return perplexity; } } public static void main(String[] args) throws IOException { // Initialize web crawler crawler web_crawler = new crawler(); Loading @@ -47,6 +99,7 @@ public class crawler { //members private final LinkedList<String> url_queue; private final HashSet<String> visited_urls; private final TrieNode wordUsage = new TrieNode(); public crawler() { url_queue = new LinkedList<>(); Loading Loading @@ -108,7 +161,7 @@ public class crawler { */ // Extract word usage data Map<String, Integer> wordUsage = extractWordUsage(web_data.text()); extractWordUsage(web_data.text(), wordUsage); String metadata = serializeWordUsage(wordUsage); try { Loading @@ -131,19 +184,25 @@ public class crawler { // STORE COMPRESSED DATA // } private Map<String, Integer> extractWordUsage(String text) { Map<String, Integer> wordCount = new HashMap<>(); // Split by whitespace and count occurrences for (String word : text.split("\\s+")) { wordCount.put(word, wordCount.getOrDefault(word, 0) + 1); private void extractWordUsage(String text, TrieNode trie) { // Split text into sentences String[] sentences = text.split("[.!?]"); for (String sentence : sentences) { for (int nGram = 1; nGram <= 2; nGram++) { String[] words = sentence.split("\\s+"); for (int i = 0; i < words.length - nGram + 1; i++) { trie.insert(Arrays.copyOfRange(words, i, i + nGram)); } return wordCount; } } } private String serializeWordUsage(Map<String, Integer> wordUsage) { private String serializeWordUsage(TrieNode wordUsage) { // Convert word usage to string StringBuilder builder = new StringBuilder(); for (Map.Entry<String, Integer> entry : wordUsage.entrySet()) { for (Map.Entry<String, TrieNode> entry : wordUsage.children.entrySet()) { builder.append(entry.getKey()).append(":").append(entry.getValue()).append(";"); } return builder.toString(); Loading