Commit 33e0c1a9 authored by Manuel Segimon

Limit page extraction

parent 935cf3fa
TrieNode.java +20 −2
@@ -2,9 +2,9 @@ package edu.bu.LanguageCorrection;
 
 import java.io.Serializable;
 import java.util.HashMap;
-import java.util.Stack;
+import java.util.Map;
 
-public class TrieNode implements Serializable {
+public class TrieNode implements Serializable, Cloneable {
     HashMap<String, TrieNode> children = new HashMap<>();
     int count = 0;
     int childCounts = 0;
@@ -111,4 +111,22 @@ public class TrieNode implements Serializable {
             index[0]++; // Skip the "}" marker
         }
     }
+
+    @Override
+    public TrieNode clone() {
+        try {
+            TrieNode clonedNode = (TrieNode) super.clone();
+            clonedNode.children = new HashMap<>();
+            for (Map.Entry<String, TrieNode> child : this.children.entrySet()) {
+                // Recursively clone and add each child to the cloned node
+                clonedNode.children.put(child.getKey(), child.getValue().clone());
+            }
+            // count and childCounts are primitive fields, so they're already correctly
+            // copied by super.clone()
+            return clonedNode;
+        } catch (CloneNotSupportedException e) {
+            // This should not happen since we're Cloneable
+            throw new AssertionError(e);
+        }
+    }
 }
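For reference, a minimal sketch (not part of the commit) of the deep-copy behavior clone() adds: mutating the original after cloning must not leak into the snapshot, which is exactly what the crawler below relies on when it snapshots the trie before each chunk and reverts on overflow. The demo assumes it sits in the edu.bu.LanguageCorrection package, since children is package-private, and that TrieNode keeps its implicit no-arg constructor.

package edu.bu.LanguageCorrection;

public class TrieNodeCloneDemo {
    public static void main(String[] args) {
        TrieNode root = new TrieNode();
        root.children.put("a", new TrieNode());

        TrieNode snapshot = root.clone();       // deep copy of the whole subtree
        root.children.put("b", new TrieNode()); // mutate the original afterwards

        System.out.println(root.children.size());     // 2
        System.out.println(snapshot.children.size()); // still 1: the clone is unaffected
    }
}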
crawler.java +102 −27
@@ -18,6 +18,9 @@ import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.List;
+import java.util.ArrayList;
+import java.io.File;
 
 
 public class crawler {
@@ -38,7 +41,7 @@ public class crawler {
         }
 
         // Start crawling
-        int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
+        int crawlLimit = 2; // Adjustable limit (SET TO 2 FOR EASE OF USE)
         web_crawler.crawl(crawlLimit);
         
         // Print visited URLs
@@ -50,6 +53,7 @@ public class crawler {
     private final HashSet<String> visited_urls;
     private String filePath = "metadata.ser";
     private TrieNode wordUsage = new TrieNode();
+    int pageCount = 0;
 
     public crawler() {
         url_queue = new LinkedList<>();
@@ -57,6 +61,27 @@ public class crawler {
 
         // Load trie from file
         wordUsage = loadFile(filePath);
+
+        // Estimate page count based on compressed file size
+        pageCount = estimatePageCount(filePath, 1024);
     }
 
+    private int estimatePageCount(String filePath, int avgCompressedSizePerPage) {
+        File file = new File(filePath);
+        if (!file.exists()) {
+            System.out.println("Compressed file does not exist. Starting with page count = 0.");
+            return 0;
+        }
+        long fileSize = file.length();
+        System.out.println("Compressed file size: " + fileSize + " bytes");
+        int estimatedPages = (int) (fileSize / avgCompressedSizePerPage);
+        if (estimatedPages == 0) {
+            System.out.println("Estimated number of pages based on compressed file size: " + 1);
+            return 1; // At least one page
+        } else {
+            System.out.println("Estimated number of pages based on compressed file size: " + estimatedPages);
+            return estimatedPages;
+        }
+    }
+
     public void add_to_queue(String url) {
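The estimate is plain integer division floored at one. A minimal sketch of the arithmetic with hypothetical sizes (the EstimateDemo class and the numbers are illustrative, not from the commit):

public class EstimateDemo {
    public static void main(String[] args) {
        long fileSize = 5_300;               // hypothetical metadata.ser size on disk
        int avgCompressedSizePerPage = 1024; // the 1KB-per-page assumption passed above
        System.out.println((int) (fileSize / avgCompressedSizePerPage)); // 5 (integer division floors)
        // A 500-byte file would give 0, which estimatePageCount clamps to 1
        // so the 1024 * pageCount budget used in processPage is never zero.
    }
}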
@@ -64,7 +89,6 @@ public class crawler {
     }
 
     public void crawl(int maxPages) {
-        int pageCount = 0;
         while (!url_queue.isEmpty() && pageCount < maxPages) {
             String cur_site = url_queue.poll();
             if (cur_site == null || visited_urls.contains(cur_site)) {
@@ -73,6 +97,7 @@ public class crawler {
             try {
                 Document web_data = get_web_data(cur_site);
                 if (web_data != null) {
+                    pageCount++;
                     processPage(web_data);
                     Elements links = web_data.select("a[href]");
                     for (Element link : links) {
@@ -82,7 +107,6 @@ public class crawler {
                         }
                     }
                     visited_urls.add(cur_site);
-                    pageCount++;
                 }
             } catch (IOException e) {
                 System.err.println("Error processing " + cur_site + ": " + e.getMessage());
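Taken together, these two hunks move the pageCount increment from after a page is processed to before processPage runs, so the 1024 * pageCount budget checked inside processPage already includes the page currently being extracted.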
@@ -107,33 +131,84 @@ public class crawler {
     }
 
     private void processPage(Document web_data) {
-        /*
-        //Print the text content of the Document
-        System.out.println("Text content:");
-        System.out.println(web_data.text());
-        */
-        // Extract word usage data
-        //System.err.println(web_data.text());
-        extractWordUsage(web_data.text(), wordUsage);
-        byte[] metadata = wordUsage.serialize();
-        System.out.println("Metadata extracted successfully.");
-
-        // STORE UNCOMPRESSED DATA //
-        System.out.println("Unompressed metadata size: " + metadata.length + " bytes");
-        writeToFile(metadata, "uncompressed-"+filePath);
-
-        // COMPRESS DATA //
-        byte[] compressedData = compress(metadata);
-        if (compressedData.length <= 1024) {
-            System.out.println("Successfully compressed metadata within the 1KB limit.");
+        byte[] compressedData = new byte[0];
+        byte[] uncompressedData = new byte[0];
+        boolean sizeLimitExceeded = false;
+        TrieNode previousWordUsageState = null; // Placeholder for the previous state of the trie
+        byte[] previousUncompressedData = new byte[0]; // Placeholder for the previous uncompressed data
+
+        // Break the page text into manageable chunks, considering sentences
+        List<String> chunks = splitTextIntoChunks(web_data.text(), 100);
+
+        for (String chunk : chunks) {
+            previousWordUsageState = wordUsage.clone();
+            previousUncompressedData = uncompressedData.clone();
+
+            // Process each chunk
+            extractWordUsage(chunk, wordUsage);
+            uncompressedData = wordUsage.serialize();
+
+            // Compress the serialized trie
+            compressedData = compress(uncompressedData);
+
+            // Check if compressed data size exceeds 1KB * pageCount
+            if (compressedData.length > 1024 * pageCount) {
+                System.out.println("Size limit exceeded. Reverting to previous chunk.");
+                sizeLimitExceeded = true;
+                wordUsage = previousWordUsageState; // Revert to the previous state of the trie
+                uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
+                compressedData = compress(uncompressedData); // Recompress the reverted state
+                break; // Stop processing further chunks
+            }
+        }
+
+        // Save the uncompressed and compressed data to separate files
+        String uncompressedFilePath = "uncompressed-" + filePath;
+        writeToFile(uncompressedData, uncompressedFilePath);
+        System.out.println("Uncompressed data exported successfully to: " + uncompressedFilePath);
+
+        writeToFile(compressedData, filePath);
+        if (!sizeLimitExceeded) {
+            System.out.println("Compressed tree exported successfully to: " + filePath);
+        } else {
+            System.out.println("WARNING: Compressed data exceeds 1KB limit. Consider optimization.");
+            System.out.println("Compressed data truncated due to size limit.");
+        }
+
+        // Output sizes of both compressed and uncompressed data for reference
+        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
+        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
+    }
 
-        // STORE COMPRESSED DATA //
-        writeToFile(compressedData, filePath);
-        System.out.println("Tree exported successfully to: " + filePath);
+    private List<String> splitTextIntoChunks(String text, int chunkSize) {
+        // Split the text into sentences.
+        String[] sentences = text.split("[.!?] ");
+        List<String> chunks = new ArrayList<>();
+        StringBuilder currentChunk = new StringBuilder();
+
+        for (String sentence : sentences) {
+            if (currentChunk.length() + sentence.length() + (currentChunk.length() > 0 ? 1 : 0) > chunkSize) {
+                if (currentChunk.length() > 0) {
+                    chunks.add(currentChunk.toString());
+                    currentChunk = new StringBuilder();
+                }
+                while (sentence.length() > chunkSize) {
+                    chunks.add(sentence.substring(0, chunkSize));
+                    sentence = sentence.substring(chunkSize);
+                }
+            }
+            // Add a space before the sentence if it's not the first sentence in the chunk.
+            if (currentChunk.length() > 0) {
+                currentChunk.append(" ");
+            }
+            currentChunk.append(sentence);
+        }
+
+        // Add the last chunk if it's not empty.
+        if (currentChunk.length() > 0) {
+            chunks.add(currentChunk.toString());
+        }
+
+        return chunks;
+    }
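To see what the chunker emits, here is a hedged, standalone harness (not in the commit): it copies the committed logic into a static method, since splitTextIntoChunks is a private instance method of crawler.

package edu.bu.LanguageCorrection;

import java.util.ArrayList;
import java.util.List;

public class ChunkDemo {
    // Verbatim copy of the committed chunking logic, made static for testing.
    static List<String> splitTextIntoChunks(String text, int chunkSize) {
        String[] sentences = text.split("[.!?] ");
        List<String> chunks = new ArrayList<>();
        StringBuilder currentChunk = new StringBuilder();
        for (String sentence : sentences) {
            if (currentChunk.length() + sentence.length() + (currentChunk.length() > 0 ? 1 : 0) > chunkSize) {
                if (currentChunk.length() > 0) {
                    chunks.add(currentChunk.toString());
                    currentChunk = new StringBuilder();
                }
                while (sentence.length() > chunkSize) {
                    chunks.add(sentence.substring(0, chunkSize));
                    sentence = sentence.substring(chunkSize);
                }
            }
            if (currentChunk.length() > 0) {
                currentChunk.append(" ");
            }
            currentChunk.append(sentence);
        }
        if (currentChunk.length() > 0) {
            chunks.add(currentChunk.toString());
        }
        return chunks;
    }

    public static void main(String[] args) {
        // Prints: [Short one, A second sentence he, re And a third?]
        // Note the hard cut inside "here": a sentence longer than chunkSize
        // is split at exactly chunkSize characters, mid-word, and the
        // remainder starts the next chunk.
        System.out.println(splitTextIntoChunks(
                "Short one. A second sentence here! And a third?", 20));
    }
}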