Commit 6c57da07 authored by Moises Bensadon
Browse files

Fix serialization and deserialization

parent d5046e6d
Loading
Loading
Loading
Loading
+13 −7
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ public class TrieNode implements Serializable, Cloneable {
    HashMap<String, TrieNode> children = new HashMap<>();
    int count = 0;
    int childCounts = 0;
    String seperator = " ";

    public void insert(String[] phrase) {
        TrieNode current = this;
@@ -55,14 +56,14 @@ public class TrieNode implements Serializable, Cloneable {
    }
    
    /**
     * Serializes this trie to a byte array.
     *
     * <p>The textual form is produced by {@code serializeHelper(this)}, then
     * whitespace padding next to braces is collapsed with regex replacements,
     * the result is trimmed, and the string is encoded with the no-argument
     * {@code getBytes()} (i.e. the JVM's default charset).
     *
     * <p>NOTE(review): the two {@code return} lines below appear to be the old
     * and new sides of one diff hunk shown together; confirm which one is in
     * the revision you are building, since only the first can ever execute.
     *
     * @return the encoded serialized trie
     */
    public byte[] serialize() {
        // Removes a single space that directly precedes each '}'.
        return serializeHelper(this).replaceAll(" \\}", "\\}").trim().getBytes();
        // Collapses "<seperator>}<seperator>" to "}" and "<seperator>{<seperator>" to "{".
        // seperator is spliced into the regex unescaped; it is initialised to " " in this class.
        return serializeHelper(this).replaceAll(seperator+"\\}"+seperator, "\\}").replaceAll(seperator+"\\{"+seperator, "\\{").trim().getBytes();
    }

    private String serializeHelper(TrieNode node) {
        StringBuilder serialized = new StringBuilder();
        serialized.append(node.count).append(" ").append(node.childCounts);
        serialized.append("\"").append(node.count).append(",").append(node.childCounts).append("\"");
        if (node.childCounts == 0) {
            serialized.append(" ");
            serialized.append(seperator);
            // System.out.println("Serialized node: " + node.count + " " + node.childCounts);
            return serialized.toString();
        } else {
@@ -70,9 +71,9 @@ public class TrieNode implements Serializable, Cloneable {
        }
        for (String key : node.children.keySet()) {
            // System.out.println(" key: " + key);
            serialized.append(key).append(" ").append(serializeHelper(node.children.get(key)));
            serialized.append("\""+key+"\":").append(seperator).append(serializeHelper(node.children.get(key)));
        }
        serialized.append("}");
        serialized.append("},");
        // System.out.println("Serialized node: " + node.count + " " + node.childCounts);

        return serialized.toString();
@@ -81,8 +82,9 @@ public class TrieNode implements Serializable, Cloneable {
    public void deserialize(byte[] data) {
        String serializedString = new String(data);
        // Ensure proper spacing around '{' and '}' to correctly parse children
        serializedString = serializedString.replaceAll("\\{", " { ").replaceAll("\\}", " } ").trim();
        String[] parts = serializedString.split("\\s+");
        serializedString = serializedString.replaceAll("\\{", seperator+"\\{"+seperator).replaceAll("\\}", "\\}").trim();
        serializedString = serializedString.replaceAll("\"", "").replaceAll(":", "").replaceAll(",", " ");
        String[] parts = serializedString.split(seperator);
        int[] index = { 0 };

        // Clear current state before rebuilding
@@ -97,13 +99,17 @@ public class TrieNode implements Serializable, Cloneable {
        }

        // Set the current node's count and childCounts
        // System.out.println(parts[index[0]]);
        node.count = Integer.parseInt(parts[index[0]++]);
        // System.out.println(parts[index[0]]);
        node.childCounts = Integer.parseInt(parts[index[0]++]);
        // System.out.println(parts[index[0]]);

        if ("{".equals(parts[index[0]])) {
            index[0]++; // Move past the "{" marker
            while (!"}".equals(parts[index[0]])) {
                String key = parts[index[0]++];
                // System.out.println("Key: " + key);
                TrieNode child = new TrieNode();
                deserializeHelper(child, parts, index);
                node.children.put(key, child);
+28 −65
Original line number Diff line number Diff line
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
import java.io.File;


public class crawler {
@@ -32,6 +31,7 @@ public class crawler {
        // Open the file and read all lines (URLs) into the queue
        String file_url = "";
        // Arguments are positional: "--file <path>" first, then an optional "--debug".
        // Each index is checked against args.length before it is read, so a missing
        // path or a missing "--debug" flag no longer throws ArrayIndexOutOfBoundsException.
        if (args.length > 1 && "--file".equals(args[0])) {
            file_url = args[1];
        }
        if (args.length > 2 && "--debug".equals(args[2])) {
            web_crawler.debug = true;
        }
        try (FileReader f_read = new FileReader(file_url);
             BufferedReader buf_read = new BufferedReader(f_read)) {
            String url_line;
@@ -53,7 +53,8 @@ public class crawler {
    private final HashSet<String> visited_urls;
    private String filePath = "metadata.ser";
    private TrieNode wordUsage = new TrieNode();
    int pageCount = 0;
    private int compression_size = 0;
    private boolean debug = false;

    public crawler() {
        url_queue = new LinkedList<>();
@@ -63,25 +64,6 @@ public class crawler {
        wordUsage = loadFile(filePath);

        // Estimate page count based on compressed file size
        pageCount = estimatePageCount(filePath, 1024);
    }

    /**
     * Estimates how many pages are stored in the file at {@code filePath} by
     * dividing its size in bytes by {@code avgCompressedSizePerPage}.
     *
     * <p>Progress is reported on standard output.
     *
     * @param filePath                 path of the file whose size is measured
     * @param avgCompressedSizePerPage divisor applied to the file size (bytes per page)
     * @return {@code 0} when the file does not exist; otherwise the truncated
     *         quotient, with a quotient of zero reported as {@code 1}
     */
    private int estimatePageCount(String filePath, int avgCompressedSizePerPage) {
        final File compressed = new File(filePath);
        if (compressed.exists()) {
            final long bytes = compressed.length();
            System.out.println("Compressed file size: " + bytes + " bytes");

            // Integer division truncates, so a file smaller than one "page"
            // yields 0; that case is rounded up to a single page.
            final int quotient = (int) (bytes / avgCompressedSizePerPage);
            final int pages = (quotient == 0) ? 1 : quotient;
            System.out.println("Estimated number of pages based on compressed file size: " + pages);
            return pages;
        }

        System.out.println("Compressed file does not exist. Starting with page count = 0.");
        return 0;
    }

    public void add_to_queue(String url) {
@@ -89,15 +71,16 @@ public class crawler {
    }

    public void crawl(int maxPages) {
        int pageCount = 0;
        while (!url_queue.isEmpty() && pageCount < maxPages) {
            String cur_site = url_queue.poll();
            if (cur_site == null || visited_urls.contains(cur_site)) {
                continue;
            }
            try {
                System.out.println("Processing: " + cur_site);
                Document web_data = get_web_data(cur_site);
                if (web_data != null) {
                    pageCount++;
                    processPage(web_data);
                    Elements links = web_data.select("a[href]");
                    for (Element link : links) {
@@ -107,6 +90,7 @@ public class crawler {
                        }
                    }
                    visited_urls.add(cur_site);
                    pageCount++;
                }
            } catch (IOException e) {
                System.err.println("Error processing " + cur_site + ": " + e.getMessage());
@@ -134,28 +118,23 @@ public class crawler {
        byte[] compressedData = new byte[0];
        byte[] uncompressedData = new byte[0];
        boolean sizeLimitExceeded = false;
        TrieNode previousWordUsageState = null; // Placeholder for the previous state of the trie
        byte[] previousUncompressedData = new byte[0]; // Placeholder for the previous uncompressed data
        byte[] previousUncompressedData = wordUsage.serialize();

        // Break the page text into manageable chunks, considering sentences
        List<String> chunks = splitTextIntoChunks(web_data.text(), 100);

        List<String> chunks = splitTextIntoChunks(web_data.text());
        // int chunkCount = 0;
        for (String chunk : chunks) {
            previousWordUsageState = wordUsage.clone();
            // chunkCount++;
            // System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size());
            previousUncompressedData = uncompressedData.clone();

            // Process each chunk
            extractWordUsage(chunk, wordUsage);
            uncompressedData = wordUsage.serialize();

            // Compress the serialized trie
            compressedData = compress(uncompressedData);

            // Check if compressed data size exceeds 1KB * pageCount
            if (compressedData.length > 1024 * pageCount) {
            if (compressedData.length - compression_size > 1024) {
                System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta:"+ (compressedData.length - compression_size) + " bytes.");
                System.out.println("Size limit exceeded. Reverting to previous chunk.");
                sizeLimitExceeded = true;
                wordUsage = previousWordUsageState; // Revert to the previous state of the trie
                uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
                compressedData = compress(uncompressedData); // Recompress the reverted state
                break; // Stop processing further chunks
@@ -163,9 +142,11 @@ public class crawler {
        }

        // Save the uncompressed and compressed data to separate files
        String uncompressedFilePath = "uncompressed-" + filePath;
        if (debug) {
            String uncompressedFilePath = "uncompressed-" + filePath.replace(".ser", ".json");
            writeToFile(uncompressedData, uncompressedFilePath);
            System.out.println("Uncompressed data exported successfully to: " + uncompressedFilePath);
        }

        writeToFile(compressedData, filePath);
        if (!sizeLimitExceeded) {
@@ -177,41 +158,21 @@ public class crawler {
        // Output sizes of both compressed and uncompressed data for reference
        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
        compression_size = compressedData.length;
    }

    private List<String> splitTextIntoChunks(String text, int chunkSize) {
        // Split the text into sentences.
        String[] sentences = text.split("[.!?] ");
    private List<String> splitTextIntoChunks(String text) {
        String[] sentences = text.split("[.!?\n] ");
        List<String> chunks = new ArrayList<>();
        StringBuilder currentChunk = new StringBuilder();

        for (String sentence : sentences) {
            if (currentChunk.length() + sentence.length() + (currentChunk.length() > 0 ? 1 : 0) > chunkSize) {
                if (currentChunk.length() > 0) {
                    chunks.add(currentChunk.toString());
                    currentChunk = new StringBuilder();
                }
                while (sentence.length() > chunkSize) {
                    chunks.add(sentence.substring(0, chunkSize));
                    sentence = sentence.substring(chunkSize);
                }
            }
            // Add a space before the sentence if it's not the first sentence in the chunk.
            if (currentChunk.length() > 0) {
                currentChunk.append(" ");
            }
            currentChunk.append(sentence);
            if (sentence.length() > 100) {
                continue;
            }

        // Add the last chunk if it's not empty.
        if (currentChunk.length() > 0) {
            chunks.add(currentChunk.toString());
            chunks.add(sentence.replaceAll("\\p{Punct}", ""));
        }

        return chunks;
    }


    private static void extractWordUsage(String text, TrieNode trie) {
        // Split text into sentences
        String[] sentences = text.split("[.!?] ");
@@ -264,12 +225,14 @@ public class crawler {
        }
    }

    private static TrieNode loadFile(String filePath) {
    private TrieNode loadFile(String filePath) {
        TrieNode trie = new TrieNode();
        try (FileInputStream fis = new FileInputStream(filePath)) {
            byte[] compressedData = fis.readAllBytes();
            byte[] decompressedData = decompress(compressedData);
            System.out.println("Decompressed metadata size: " + decompressedData.length + " bytes");
            compression_size = compressedData.length;
            // System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
            // System.out.println("Decompressed metadata size: " + decompressedData.length + " bytes");
            trie.deserialize(decompressedData);
            System.out.println("Metadata loaded successfully.");
            return trie;