Commit 2c34f35c authored by Manuel Segimon
Browse files

update crawler functionality to be Trie based

parent f13ca7e6
Loading
Loading
Loading
Loading
+69 −10
Original line number Diff line number Diff line
@@ -19,8 +19,60 @@ import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class crawler {

    /**
     * Node of a word-level trie that stores n-gram counts.
     *
     * <p>Each edge is labelled with one word, so the path from the root to a node spells an
     * n-gram. {@code count} is how many times the n-gram ending at this node was inserted;
     * {@code childCounts} is how many inserted n-grams end at a direct child of this node and
     * serves as the denominator for conditional probabilities.
     */
    static class TrieNode {
        /** ln(2), used to convert natural logarithms to base-2 logarithms. */
        private static final double LN_2 = Math.log(2);

        HashMap<String, TrieNode> children = new HashMap<>();
        int count = 0;
        int childCounts = 0;

        /**
         * Records one occurrence of the n-gram {@code phrase}.
         *
         * @param phrase the words of the n-gram, in order; an empty array is ignored so that it
         *     cannot inflate the root's own counters
         */
        public void insert(String[] phrase) {
            if (phrase.length == 0) {
                return;
            }
            TrieNode current = this;
            TrieNode past = this; // parent of the node the phrase ends at
            for (String word : phrase) {
                past = current;
                current = current.children.computeIfAbsent(word, c -> new TrieNode());
            }
            current.count += 1;
            past.childCounts += 1;
        }

        /**
         * Returns the relative frequency of the last word of {@code phrase} given the words
         * before it.
         *
         * @param phrase words separated by single spaces
         * @return {@code count / parent.childCounts} for the final node, or {@code 0} when the
         *     phrase is not in the trie or its parent has no recorded n-grams
         */
        public float probability(String phrase) {
            TrieNode current = this;
            TrieNode past = this;
            for (String word : phrase.split(" ")) {
                past = current;
                current = current.children.get(word);
                if (current == null) {
                    return 0;
                }
            }
            return relativeFrequency(current, past);
        }

        /**
         * Returns {@code 2^(-sum of log2 p_i)}, where {@code p_i} is the relative frequency of
         * each successive prefix of {@code phrase}; this is the inverse of the product of those
         * frequencies. The value is not normalised by phrase length.
         *
         * @param phrase words separated by single spaces
         * @return the perplexity, or {@link Float#MAX_VALUE} when any prefix is missing from the
         *     trie or has zero relative frequency
         */
        public float perplexity(String phrase) {
            TrieNode current = this;
            TrieNode past = this;
            double log2Prob = 0;
            for (String word : phrase.split(" ")) {
                past = current;
                current = current.children.get(word);
                if (current == null) {
                    return Float.MAX_VALUE;
                }
                float stepProbability = relativeFrequency(current, past);
                if (stepProbability <= 0) {
                    // Prefix exists only as an interior node: treat it like an unseen phrase.
                    return Float.MAX_VALUE;
                }
                // Math.log is the natural log; divide by ln(2) so the base matches Math.pow(2, ...).
                log2Prob += Math.log(stepProbability) / LN_2;
            }
            return (float) Math.pow(2, -log2Prob);
        }

        /** Returns node.count / parent.childCounts, or 0 when the denominator is zero. */
        private static float relativeFrequency(TrieNode node, TrieNode parent) {
            if (parent.childCounts == 0) {
                return 0;
            }
            return (float) node.count / parent.childCounts;
        }
    }

    public static void main(String[] args) throws IOException {
        // Initialize web crawler
        crawler web_crawler = new crawler();
@@ -47,6 +99,7 @@ public class crawler {
    //members
    private final LinkedList<String> url_queue;
    private final HashSet<String> visited_urls;
    private final TrieNode wordUsage = new TrieNode();

    public crawler() {
        url_queue = new LinkedList<>();
@@ -108,7 +161,7 @@ public class crawler {
        */

        // Extract word usage data
        Map<String, Integer> wordUsage = extractWordUsage(web_data.text());
        extractWordUsage(web_data.text(), wordUsage);
        String metadata = serializeWordUsage(wordUsage);

        try {
@@ -131,19 +184,25 @@ public class crawler {
        // STORE COMPRESSED DATA //
    }

    private Map<String, Integer> extractWordUsage(String text) {
        Map<String, Integer> wordCount = new HashMap<>();
        // Split by whitespace and count occurrences
        for (String word : text.split("\\s+")) {
            wordCount.put(word, wordCount.getOrDefault(word, 0) + 1);
    private void extractWordUsage(String text, TrieNode trie) {
        // Split text into sentences
        String[] sentences = text.split("[.!?]");
        for (String sentence : sentences) {
            for (int nGram = 1; nGram <= 2; nGram++) {
                String[] words = sentence.split("\\s+");
                for (int i = 0; i < words.length - nGram + 1; i++) {
                    trie.insert(Arrays.copyOfRange(words, i, i + nGram));
                }
        return wordCount;
            }
        }


    }

    private String serializeWordUsage(Map<String, Integer> wordUsage) {
    private String serializeWordUsage(TrieNode wordUsage) {
        // Convert word usage to string
        StringBuilder builder = new StringBuilder();
        for (Map.Entry<String, Integer> entry : wordUsage.entrySet()) {
        for (Map.Entry<String, TrieNode> entry : wordUsage.children.entrySet()) {
            builder.append(entry.getKey()).append(":").append(entry.getValue()).append(";");
        }
        return builder.toString();