src/main/java/edu/bu/LanguageCorrection/TrieNode.java  +20 −2

@@ -2,9 +2,9 @@
 package edu.bu.LanguageCorrection;

 import java.io.Serializable;
 import java.util.HashMap;
-import java.util.Stack;
+import java.util.Map;

-public class TrieNode implements Serializable {
+public class TrieNode implements Serializable, Cloneable {
     HashMap<String, TrieNode> children = new HashMap<>();
     int count = 0;
     int childCounts = 0;
@@ -111,4 +111,22 @@ public class TrieNode implements Serializable {
             index[0]++; // Skip the "}" marker
         }
     }
+
+    @Override
+    public TrieNode clone() {
+        try {
+            TrieNode clonedNode = (TrieNode) super.clone();
+            clonedNode.children = new HashMap<>();
+            for (Map.Entry<String, TrieNode> child : this.children.entrySet()) {
+                // Recursively clone and add each child to the cloned node
+                clonedNode.children.put(child.getKey(), child.getValue().clone());
+            }
+            // count and childCounts are primitive types, so they're already
+            // correctly copied by super.clone()
+            return clonedNode;
+        } catch (CloneNotSupportedException e) {
+            // This should not happen since we're Cloneable
+            throw new AssertionError(e);
+        }
+    }
 }
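Because TrieNode's children map nests further TrieNodes, a plain super.clone() would copy only the map reference, so the original and the copy would share every node below the root. The recursive copy above avoids that, which is what lets the crawler snapshot the trie before a chunk and roll back cleanly. Below is a minimal sketch, not part of the PR, illustrating the deep-copy behavior; the CloneDemo class is hypothetical and assumes the package-private fields shown in the diff.

package edu.bu.LanguageCorrection;

// Hypothetical demo class (not in the PR): verifies that clone() produces
// an independent deep copy of the trie rather than a shallow alias.
public class CloneDemo {
    public static void main(String[] args) {
        TrieNode root = new TrieNode();
        root.children.put("the", new TrieNode());
        root.children.get("the").count = 5;

        TrieNode snapshot = root.clone();    // deep copy of the whole subtree
        root.children.get("the").count = 99; // mutate the original afterwards

        // Prints 5: the snapshot's child was cloned, so the later mutation
        // of the original does not leak into the snapshot.
        System.out.println(snapshot.children.get("the").count);
    }
}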
src/main/java/edu/bu/LanguageCorrection/crawler.java  +102 −27

@@ -18,6 +18,9 @@
 import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.List;
+import java.util.ArrayList;
+import java.io.File;

 public class crawler {
@@ -38,7 +41,7 @@
     }

     // Start crawling
-    int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
+    int crawlLimit = 2; // Adjustable limit (SET TO 1 FOR EASE OF USE)
     web_crawler.crawl(crawlLimit);

     // Print visited URLs
@@ -50,6 +53,7 @@
     private final HashSet<String> visited_urls;
     private String filePath = "metadata.ser";
     private TrieNode wordUsage = new TrieNode();
+    int pageCount = 0;

     public crawler() {
         url_queue = new LinkedList<>();
@@ -57,6 +61,27 @@
         // Load trie from file
         wordUsage = loadFile(filePath);
+
+        // Estimate page count based on compressed file size
+        pageCount = estimatePageCount(filePath, 1024);
+    }
+
+    private int estimatePageCount(String filePath, int avgCompressedSizePerPage) {
+        File file = new File(filePath);
+        if (!file.exists()) {
+            System.out.println("Compressed file does not exist. Starting with page count = 0.");
+            return 0;
+        }
+        long fileSize = file.length();
+        System.out.println("Compressed file size: " + fileSize + " bytes");
+        int estimatedPages = (int) (fileSize / avgCompressedSizePerPage);
+        if (estimatedPages == 0) {
+            System.out.println("Estimated number of pages based on compressed file size: " + 1);
+            return 1; // At least one page
+        } else {
+            System.out.println("Estimated number of pages based on compressed file size: " + estimatedPages);
+            return estimatedPages;
+        }
+    }

     public void add_to_queue(String url) {
@@ -64,7 +89,6 @@
     }

     public void crawl(int maxPages) {
-        int pageCount = 0;
         while (!url_queue.isEmpty() && pageCount < maxPages) {
             String cur_site = url_queue.poll();
             if (cur_site == null || visited_urls.contains(cur_site)) {
@@ -73,6 +97,7 @@
             try {
                 Document web_data = get_web_data(cur_site);
                 if (web_data != null) {
+                    pageCount++;
                    processPage(web_data);
                    Elements links = web_data.select("a[href]");
                    for (Element link : links) {
@@ -82,7 +107,6 @@
                    }
                }
                visited_urls.add(cur_site);
-               pageCount++;
            }
        } catch (IOException e) {
            System.err.println("Error processing " + cur_site + ": " + e.getMessage());
@@ -107,33 +131,84 @@
    }

    private void processPage(Document web_data) {
-        /*
-        //Print the text content of the Document
-        System.out.println("Text content:");
-        System.out.println(web_data.text());
-        */
-        // Extract word usage data
-        //System.err.println(web_data.text());
-        extractWordUsage(web_data.text(), wordUsage);
-        byte[] metadata = wordUsage.serialize();
         System.out.println("Metadata extracted successfully.");

-        // STORE UNCOMPRESSED DATA //
-        System.out.println("Unompressed metadata size: " + metadata.length + " bytes");
-        writeToFile(metadata, "uncompressed-"+filePath);
-
-        // COMPRESS DATA //
-        byte[] compressedData = compress(metadata);
-        if (compressedData.length <= 1024) {
-            System.out.println("Successfully compressed metadata within the 1KB limit.");
+        byte[] compressedData = new byte[0];
+        byte[] uncompressedData = new byte[0];
+        boolean sizeLimitExceeded = false;
+        TrieNode previousWordUsageState = null;        // Placeholder for the previous state of the trie
+        byte[] previousUncompressedData = new byte[0]; // Placeholder for the previous uncompressed data
+
+        // Break the page text into manageable chunks, considering sentences
+        List<String> chunks = splitTextIntoChunks(web_data.text(), 100);
+        for (String chunk : chunks) {
+            previousWordUsageState = wordUsage.clone();
+            previousUncompressedData = uncompressedData.clone();
+
+            // Process each chunk
+            extractWordUsage(chunk, wordUsage);
+            uncompressedData = wordUsage.serialize();
+
+            // Compress the serialized trie
+            compressedData = compress(uncompressedData);
+
+            // Check if compressed data size exceeds 1KB * pageCount
+            if (compressedData.length > 1024 * pageCount) {
+                System.out.println("Size limit exceeded. Reverting to previous chunk.");
+                sizeLimitExceeded = true;
+                wordUsage = previousWordUsageState;          // Revert to the previous state of the trie
+                uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
+                compressedData = compress(uncompressedData); // Recompress the reverted state
+                break; // Stop processing further chunks
+            }
+        }
+
+        // Save the uncompressed and compressed data to separate files
+        String uncompressedFilePath = "uncompressed-" + filePath;
+        writeToFile(uncompressedData, uncompressedFilePath);
+        System.out.println("Uncompressed data exported successfully to: " + uncompressedFilePath);
+        writeToFile(compressedData, filePath);
+        if (!sizeLimitExceeded) {
+            System.out.println("Compressed tree exported successfully to: " + filePath);
         } else {
-            System.out.println("WARNING: Compressed data exceeds 1KB limit. Consider optimization.");
+            System.out.println("Compressed data truncated due to size limit.");
         }
+
+        // Output sizes of both compressed and uncompressed data for reference
+        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
+        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
    }

-        // STORE COMPRESSED DATA //
-        writeToFile(compressedData, filePath);
-        System.out.println("Tree exported successfully to: " + filePath);
+    private List<String> splitTextIntoChunks(String text, int chunkSize) {
+        // Split the text into sentences.
+        String[] sentences = text.split("[.!?] ");
+        List<String> chunks = new ArrayList<>();
+        StringBuilder currentChunk = new StringBuilder();
+        for (String sentence : sentences) {
+            if (currentChunk.length() + sentence.length() + (currentChunk.length() > 0 ? 1 : 0) > chunkSize) {
+                if (currentChunk.length() > 0) {
+                    chunks.add(currentChunk.toString());
+                    currentChunk = new StringBuilder();
+                }
+                while (sentence.length() > chunkSize) {
+                    chunks.add(sentence.substring(0, chunkSize));
+                    sentence = sentence.substring(chunkSize);
+                }
+            }
+            // Add a space before the sentence if it's not the first sentence in the chunk.
+            if (currentChunk.length() > 0) {
+                currentChunk.append(" ");
+            }
+            currentChunk.append(sentence);
+        }
+        // Add the last chunk if it's not empty.
+        if (currentChunk.length() > 0) {
+            chunks.add(currentChunk.toString());
+        }
+        return chunks;
+    }
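processPage() relies on a compress(byte[]) helper that is not part of this diff. A byte[] to byte[] compressor with this shape is commonly built on java.util.zip.Deflater; the sketch below is only an assumption about what the helper might look like, not the project's actual implementation, and the class name CompressSketch is made up.

package edu.bu.LanguageCorrection;

import java.io.ByteArrayOutputStream;
import java.util.zip.Deflater;

// Hypothetical stand-in for the compress() helper called in processPage();
// the real implementation is not shown in this diff.
final class CompressSketch {
    static byte[] compress(byte[] input) {
        Deflater deflater = new Deflater(Deflater.BEST_COMPRESSION);
        deflater.setInput(input);
        deflater.finish(); // no more input; deflate() can now drain everything

        ByteArrayOutputStream out = new ByteArrayOutputStream(input.length);
        byte[] buffer = new byte[1024];
        while (!deflater.finished()) {
            int n = deflater.deflate(buffer); // writes compressed bytes into buffer
            out.write(buffer, 0, n);
        }
        deflater.end(); // release the native zlib resources
        return out.toByteArray();
    }
}

Under this reading, the size check in the chunk loop compares the deflated size of the entire serialized trie against the 1024 * pageCount budget, which is why the reverted trie state has to be recompressed before the loop exits.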
target/classes/edu/bu/LanguageCorrection/TrieNode.class  +858 B (5.39 KiB): binary file changed, no diff preview for this file type.
target/classes/edu/bu/LanguageCorrection/crawler.class  +2.35 KiB (10.9 KiB): binary file changed, no diff preview for this file type.