Fix perplexity (5c210d2f) · Commits · EC504 Spring 2024 Group Projects / Group7

.gitignore

+2 −1

+1 −22

Original line number	Diff line number	Diff line
		https://en.wikipedia.org/wiki/Web_crawler
		https://link.springer.com/book/10.1007/978-3-540-46332-0
		http://www.youtube.com
		http://www.facebook.com
		http://www.yahoo.com
		http://www.amazon.com
		http://www.wikipedia.org
		http://www.twitter.com
		http://www.live.com
		http://www.bing.com
		http://www.instagram.com
		http://www.linkedin.com
		http://www.msn.com
		http://www.vk.com
		http://www.hao123.com
		http://www.reddit.com
		http://www.ebay.com
		http://www.t.co
		http://www.tmall.com
		http://www.sohu.com
		http://www.pinterest.com
		http://www.netflix.com
		http://www.microsoft.com
		No newline at end of file
		https://en.wikipedia.org/wiki/Wikipedia:Vital_articles
		No newline at end of file

+8 −4

Original line number	Diff line number	Diff line
		@@ -41,17 +41,21 @@ public class TrieNode implements Serializable, Cloneable {
		TrieNode current = this;
		TrieNode past = this;
		float logProb = 0;
		for (String word : phrase.split(" ")) {
		String[] words = phrase.split(" ");
		if (words.length == 1) {
		return 1;
		}
		for (String word : words) {
		past = current;
		current = current.children.get(word);
		if (current == null) {
		// System.out.println("Phrase not found in trie.");
		return Float.MAX_VALUE;
		float alpha = 2;
		return alpha * perplexity(phrase.replaceFirst(words[0] + " ", ""));
		}
		logProb += Math.log((float) current.count / past.childCounts);
		}
		float perplexity = (float) Math.pow(2, -logProb);
		// System.out.println("Perplexity of phrase: " + perplexity);
		//System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity);
		return perplexity;
		}

+7 −3

Original line number	Diff line number	Diff line
		@@ -41,7 +41,7 @@ public class crawler {
		}

		// Start crawling
		int crawlLimit = 2; // Adjustable limit (SET TO 1 FOR EASE OF USE)
		int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
		web_crawler.crawl(crawlLimit);

		// Print visited URLs
		@@ -122,9 +122,13 @@ public class crawler {

		// Break the page text into manageable chunks, considering sentences
		List<String> chunks = splitTextIntoChunks(web_data.text());
		// int chunkCount = 0;
		int chunkCount = 0;
		if (chunks.isEmpty()) {
		System.out.println("No text found on page.");
		return;
		}
		for (String chunk : chunks) {
		// chunkCount++;
		chunkCount++;
		// System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size());
		previousUncompressedData = uncompressedData.clone();
		extractWordUsage(chunk, wordUsage);