Fix serialization and deserialization (6c57da07) · Commits · EC504 Spring 2024 Group Projects / Group7

src/main/java/edu/bu/LanguageCorrection/TrieNode.java

+13 −7

Original line number	Diff line number	Diff line
		@@ -8,6 +8,7 @@ public class TrieNode implements Serializable, Cloneable {
		HashMap<String, TrieNode> children = new HashMap<>();
		int count = 0;
		int childCounts = 0;
		String seperator = " ";

		public void insert(String[] phrase) {
		TrieNode current = this;
		@@ -55,14 +56,14 @@ public class TrieNode implements Serializable, Cloneable {
		}

		public byte[] serialize() {
		return serializeHelper(this).replaceAll(" \\}", "\\}").trim().getBytes();
		return serializeHelper(this).replaceAll(seperator+"\\}"+seperator, "\\}").replaceAll(seperator+"\\{"+seperator, "\\{").trim().getBytes();
		}

		private String serializeHelper(TrieNode node) {
		StringBuilder serialized = new StringBuilder();
		serialized.append(node.count).append(" ").append(node.childCounts);
		serialized.append("\"").append(node.count).append(",").append(node.childCounts).append("\"");
		if (node.childCounts == 0) {
		serialized.append(" ");
		serialized.append(seperator);
		// System.out.println("Serialized node: " + node.count + " " + node.childCounts);
		return serialized.toString();
		} else {
		@@ -70,9 +71,9 @@ public class TrieNode implements Serializable, Cloneable {
		}
		for (String key : node.children.keySet()) {
		// System.out.println(" key: " + key);
		serialized.append(key).append(" ").append(serializeHelper(node.children.get(key)));
		serialized.append("\""+key+"\":").append(seperator).append(serializeHelper(node.children.get(key)));
		}
		serialized.append("}");
		serialized.append("},");
		// System.out.println("Serialized node: " + node.count + " " + node.childCounts);

		return serialized.toString();
		@@ -81,8 +82,9 @@ public class TrieNode implements Serializable, Cloneable {
		public void deserialize(byte[] data) {
		String serializedString = new String(data);
		// Ensure proper spacing around '{' and '}' to correctly parse children
		serializedString = serializedString.replaceAll("\\{", " { ").replaceAll("\\}", " } ").trim();
		String[] parts = serializedString.split("\\s+");
		serializedString = serializedString.replaceAll("\\{", seperator+"\\{"+seperator).replaceAll("\\}", "\\}").trim();
		serializedString = serializedString.replaceAll("\"", "").replaceAll(":", "").replaceAll(",", " ");
		String[] parts = serializedString.split(seperator);
		int[] index = { 0 };

		// Clear current state before rebuilding
		@@ -97,13 +99,17 @@ public class TrieNode implements Serializable, Cloneable {
		}

		// Set the current node's count and childCounts
		// System.out.println(parts[index[0]]);
		node.count = Integer.parseInt(parts[index[0]++]);
		// System.out.println(parts[index[0]]);
		node.childCounts = Integer.parseInt(parts[index[0]++]);
		// System.out.println(parts[index[0]]);

		if ("{".equals(parts[index[0]])) {
		index[0]++; // Move past the "{" marker
		while (!"}".equals(parts[index[0]])) {
		String key = parts[index[0]++];
		// System.out.println("Key: " + key);
		TrieNode child = new TrieNode();
		deserializeHelper(child, parts, index);
		node.children.put(key, child);

src/main/java/edu/bu/LanguageCorrection/crawler.java

+28 −65

Original line number	Diff line number	Diff line
		@@ -20,7 +20,6 @@ import java.io.IOException;
		import java.util.Arrays;
		import java.util.List;
		import java.util.ArrayList;
		import java.io.File;


		public class crawler {
		@@ -32,6 +31,7 @@ public class crawler {
		// Open the file and read all lines (URLs) into the queue
		String file_url = "";
		if (args.length != 0 && args[0].equals("--file")) file_url = args[1];
		if (args.length > 1 && args[2].equals("--debug")) web_crawler.debug = true;
		try (FileReader f_read = new FileReader(file_url);
		BufferedReader buf_read = new BufferedReader(f_read)) {
		String url_line;
		@@ -53,7 +53,8 @@ public class crawler {
		private final HashSet<String> visited_urls;
		private String filePath = "metadata.ser";
		private TrieNode wordUsage = new TrieNode();
		int pageCount = 0;
		private int compression_size = 0;
		private boolean debug = false;

		public crawler() {
		url_queue = new LinkedList<>();
		@@ -63,25 +64,6 @@ public class crawler {
		wordUsage = loadFile(filePath);

		// Estimate page count based on compressed file size
		pageCount = estimatePageCount(filePath, 1024);
		}

		/**
		 * Estimates how many pages are represented by the file at {@code filePath}
		 * by dividing its size in bytes by an assumed per-page size.
		 *
		 * <p>Prints the file size and the resulting estimate to standard output.
		 *
		 * @param filePath                 path of the file whose size is inspected
		 * @param avgCompressedSizePerPage assumed number of bytes one page occupies
		 *                                 in the file; must be positive
		 * @return 0 if the file does not exist; otherwise the size-based estimate,
		 *         never less than 1 and never more than {@link Integer#MAX_VALUE}
		 * @throws IllegalArgumentException if {@code avgCompressedSizePerPage} is
		 *                                  zero or negative
		 */
		private int estimatePageCount(String filePath, int avgCompressedSizePerPage) {
		    // Reject a non-positive divisor up front: zero would otherwise surface
		    // as a bare ArithmeticException and a negative value would produce a
		    // meaningless estimate.
		    if (avgCompressedSizePerPage <= 0) {
		        throw new IllegalArgumentException(
		                "avgCompressedSizePerPage must be positive: " + avgCompressedSizePerPage);
		    }

		    File file = new File(filePath);
		    if (!file.exists()) {
		        System.out.println("Compressed file does not exist. Starting with page count = 0.");
		        return 0;
		    }

		    long fileSize = file.length();
		    System.out.println("Compressed file size: " + fileSize + " bytes");

		    // Stay in long arithmetic until the value is known to fit in an int, so a
		    // very large file cannot wrap around to a negative page count. An
		    // existing file always counts as at least one page.
		    long pages = Math.max(1L, fileSize / avgCompressedSizePerPage);
		    int estimatedPages = (int) Math.min(pages, (long) Integer.MAX_VALUE);

		    System.out.println("Estimated number of pages based on compressed file size: " + estimatedPages);
		    return estimatedPages;
		}

		public void add_to_queue(String url) {
		@@ -89,15 +71,16 @@ public class crawler {
		}

		public void crawl(int maxPages) {
		int pageCount = 0;
		while (!url_queue.isEmpty() && pageCount < maxPages) {
		String cur_site = url_queue.poll();
		if (cur_site == null \|\| visited_urls.contains(cur_site)) {
		continue;
		}
		try {
		System.out.println("Processing: " + cur_site);
		Document web_data = get_web_data(cur_site);
		if (web_data != null) {
		pageCount++;
		processPage(web_data);
		Elements links = web_data.select("a[href]");
		for (Element link : links) {
		@@ -107,6 +90,7 @@ public class crawler {
		}
		}
		visited_urls.add(cur_site);
		pageCount++;
		}
		} catch (IOException e) {
		System.err.println("Error processing " + cur_site + ": " + e.getMessage());
		@@ -134,28 +118,23 @@ public class crawler {
		byte[] compressedData = new byte[0];
		byte[] uncompressedData = new byte[0];
		boolean sizeLimitExceeded = false;
		TrieNode previousWordUsageState = null; // Placeholder for the previous state of the trie
		byte[] previousUncompressedData = new byte[0]; // Placeholder for the previous uncompressed data
		byte[] previousUncompressedData = wordUsage.serialize();

		// Break the page text into manageable chunks, considering sentences
		List<String> chunks = splitTextIntoChunks(web_data.text(), 100);

		List<String> chunks = splitTextIntoChunks(web_data.text());
		// int chunkCount = 0;
		for (String chunk : chunks) {
		previousWordUsageState = wordUsage.clone();
		// chunkCount++;
		// System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size());
		previousUncompressedData = uncompressedData.clone();

		// Process each chunk
		extractWordUsage(chunk, wordUsage);
		uncompressedData = wordUsage.serialize();

		// Compress the serialized trie
		compressedData = compress(uncompressedData);

		// Check if the compressed data exceeds the 1KB-per-page size budget
		if (compressedData.length > 1024 * pageCount) {
		if (compressedData.length - compression_size > 1024) {
		System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta:"+ (compressedData.length - compression_size) + " bytes.");
		System.out.println("Size limit exceeded. Reverting to previous chunk.");
		sizeLimitExceeded = true;
		wordUsage = previousWordUsageState; // Revert to the previous state of the trie
		uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
		compressedData = compress(uncompressedData); // Recompress the reverted state
		break; // Stop processing further chunks
		@@ -163,9 +142,11 @@ public class crawler {
		}

		// Save the uncompressed and compressed data to separate files
		String uncompressedFilePath = "uncompressed-" + filePath;
		if (debug) {
		String uncompressedFilePath = "uncompressed-" + filePath.replace(".ser", ".json");
		writeToFile(uncompressedData, uncompressedFilePath);
		System.out.println("Uncompressed data exported successfully to: " + uncompressedFilePath);
		}

		writeToFile(compressedData, filePath);
		if (!sizeLimitExceeded) {
		@@ -177,41 +158,21 @@ public class crawler {
		// Output sizes of both compressed and uncompressed data for reference
		System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
		System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
		compression_size = compressedData.length;
		}

		private List<String> splitTextIntoChunks(String text, int chunkSize) {
		// Split the text into sentences.
		String[] sentences = text.split("[.!?] ");
		private List<String> splitTextIntoChunks(String text) {
		String[] sentences = text.split("[.!?\n] ");
		List<String> chunks = new ArrayList<>();
		StringBuilder currentChunk = new StringBuilder();

		for (String sentence : sentences) {
		if (currentChunk.length() + sentence.length() + (currentChunk.length() > 0 ? 1 : 0) > chunkSize) {
		if (currentChunk.length() > 0) {
		chunks.add(currentChunk.toString());
		currentChunk = new StringBuilder();
		}
		while (sentence.length() > chunkSize) {
		chunks.add(sentence.substring(0, chunkSize));
		sentence = sentence.substring(chunkSize);
		}
		}
		// Add a space before the sentence if it's not the first sentence in the chunk.
		if (currentChunk.length() > 0) {
		currentChunk.append(" ");
		}
		currentChunk.append(sentence);
		if (sentence.length() > 100) {
		continue;
		}

		// Add the last chunk if it's not empty.
		if (currentChunk.length() > 0) {
		chunks.add(currentChunk.toString());
		chunks.add(sentence.replaceAll("\\p{Punct}", ""));
		}

		return chunks;
		}


		private static void extractWordUsage(String text, TrieNode trie) {
		// Split text into sentences
		String[] sentences = text.split("[.!?] ");
		@@ -264,12 +225,14 @@ public class crawler {
		}
		}

		private static TrieNode loadFile(String filePath) {
		private TrieNode loadFile(String filePath) {
		TrieNode trie = new TrieNode();
		try (FileInputStream fis = new FileInputStream(filePath)) {
		byte[] compressedData = fis.readAllBytes();
		byte[] decompressedData = decompress(compressedData);
		System.out.println("Decompressed metadata size: " + decompressedData.length + " bytes");
		compression_size = compressedData.length;
		// System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
		// System.out.println("Decompressed metadata size: " + decompressedData.length + " bytes");
		trie.deserialize(decompressedData);
		System.out.println("Metadata loaded successfully.");
		return trie;