Commit 5c210d2f authored by Manuel Segimon
Browse files

Fix perplexity

parent a6145abc
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -3,3 +3,4 @@
target/
.vscode/
.idea/
*.json
 No newline at end of file
+1 −22
Original line number Diff line number Diff line
https://en.wikipedia.org/wiki/Web_crawler
https://link.springer.com/book/10.1007/978-3-540-46332-0
http://www.youtube.com
http://www.facebook.com
http://www.yahoo.com
http://www.amazon.com
http://www.wikipedia.org
http://www.twitter.com
http://www.live.com
http://www.bing.com
http://www.instagram.com
http://www.linkedin.com
http://www.msn.com
http://www.vk.com
http://www.hao123.com
http://www.reddit.com
http://www.ebay.com
http://www.t.co
http://www.tmall.com
http://www.sohu.com
http://www.pinterest.com
http://www.netflix.com
http://www.microsoft.com
 No newline at end of file
https://en.wikipedia.org/wiki/Wikipedia:Vital_articles
 No newline at end of file
+8 −4
Original line number Diff line number Diff line
@@ -41,17 +41,21 @@ public class TrieNode implements Serializable, Cloneable {
        TrieNode current = this;
        TrieNode past = this;
        float logProb = 0;
        for (String word : phrase.split(" ")) {
        String[] words = phrase.split(" ");
        if (words.length == 1) {
            return 1;
        }
        for (String word : words) {
            past = current;
            current = current.children.get(word);
            if (current == null) {
                // System.out.println("Phrase not found in trie.");
                return Float.MAX_VALUE;
                float alpha = 2;
                return alpha * perplexity(phrase.replaceFirst(words[0] + " ", ""));
            }
            logProb += Math.log((float) current.count / past.childCounts);
        }
        float perplexity = (float) Math.pow(2, -logProb);
        // System.out.println("Perplexity of phrase: " + perplexity);
        //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity);
        return perplexity;
    }
    
+7 −3
Original line number Diff line number Diff line
@@ -41,7 +41,7 @@ public class crawler {
        }

        // Start crawling
        int crawlLimit = 2; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        web_crawler.crawl(crawlLimit);
        
        // Print visited URLs
@@ -122,9 +122,13 @@ public class crawler {

        // Break the page text into manageable chunks, considering sentences
        List<String> chunks = splitTextIntoChunks(web_data.text());
        // int chunkCount = 0;
        int chunkCount = 0;
        if (chunks.isEmpty()) {
            System.out.println("No text found on page.");
            return;
        }
        for (String chunk : chunks) {
            // chunkCount++;
            chunkCount++;
            // System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size());
            previousUncompressedData = uncompressedData.clone();
            extractWordUsage(chunk, wordUsage);