Commit 33e0c1a9 authored by Manuel Segimon

Limit page extraction

parent 935cf3fa
TrieNode.java +20 −2
@@ -2,9 +2,9 @@ package edu.bu.LanguageCorrection;
 
 import java.io.Serializable;
 import java.util.HashMap;
-import java.util.Stack;
+import java.util.Map;
 
-public class TrieNode implements Serializable {
+public class TrieNode implements Serializable, Cloneable {
     HashMap<String, TrieNode> children = new HashMap<>();
     int count = 0;
     int childCounts = 0;
@@ -111,4 +111,22 @@ public class TrieNode implements Serializable {
             index[0]++; // Skip the "}" marker
         }
     }
+
+    @Override
+    public TrieNode clone() {
+        try {
+            TrieNode clonedNode = (TrieNode) super.clone();
+            clonedNode.children = new HashMap<>();
+            for (Map.Entry<String, TrieNode> child : this.children.entrySet()) {
+                // Recursively clone and add each child to the cloned node
+                clonedNode.children.put(child.getKey(), child.getValue().clone());
+            }
+            // count and childCounts are primitive fields, so they're already correctly
+            // copied by super.clone()
+            return clonedNode;
+        } catch (CloneNotSupportedException e) {
+            // This should not happen since we're Cloneable
+            throw new AssertionError(e);
+        }
+    }
 }
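For reference, a minimal sketch (not part of the commit) of the deep-copy behavior clone() adds: mutating the original after cloning must not leak into the snapshot, which is exactly what the crawler below relies on when it snapshots the trie before each chunk and reverts on overflow. The demo assumes it sits in the edu.bu.LanguageCorrection package, since children is package-private, and that TrieNode keeps its implicit no-arg constructor.

package edu.bu.LanguageCorrection;

public class TrieNodeCloneDemo {
    public static void main(String[] args) {
        TrieNode root = new TrieNode();
        root.children.put("a", new TrieNode());

        TrieNode snapshot = root.clone();       // deep copy of the whole subtree
        root.children.put("b", new TrieNode()); // mutate the original afterwards

        System.out.println(root.children.size());     // 2
        System.out.println(snapshot.children.size()); // still 1: the clone is unaffected
    }
}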
crawler.java +102 −27
@@ -18,6 +18,9 @@ import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.List;
+import java.util.ArrayList;
+import java.io.File;
 
 
 public class crawler {
@@ -38,7 +41,7 @@ public class crawler {
         }
 
         // Start crawling
-        int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
+        int crawlLimit = 2; // Adjustable limit (SET TO 2 FOR EASE OF USE)
         web_crawler.crawl(crawlLimit);
         
         // Print visited URLs
@@ -50,6 +53,7 @@ public class crawler {
     private final HashSet<String> visited_urls;
     private String filePath = "metadata.ser";
     private TrieNode wordUsage = new TrieNode();
+    int pageCount = 0;
 
     public crawler() {
         url_queue = new LinkedList<>();
@@ -57,6 +61,27 @@ public class crawler {
 
         // Load trie from file
         wordUsage = loadFile(filePath);
+
+        // Estimate page count based on compressed file size
+        pageCount = estimatePageCount(filePath, 1024);
     }
 
+    private int estimatePageCount(String filePath, int avgCompressedSizePerPage) {
+        File file = new File(filePath);
+        if (!file.exists()) {
+            System.out.println("Compressed file does not exist. Starting with page count = 0.");
+            return 0;
+        }
+        long fileSize = file.length();
+        System.out.println("Compressed file size: " + fileSize + " bytes");
+        int estimatedPages = (int) (fileSize / avgCompressedSizePerPage);
+        if (estimatedPages == 0) {
+            System.out.println("Estimated number of pages based on compressed file size: " + 1);
+            return 1; // At least one page
+        } else {
+            System.out.println("Estimated number of pages based on compressed file size: " + estimatedPages);
+            return estimatedPages;
+        }
+    }
+
     public void add_to_queue(String url) {
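The estimate is plain integer division floored at one. A minimal sketch of the arithmetic with hypothetical sizes (the EstimateDemo class and the numbers are illustrative, not from the commit):

public class EstimateDemo {
    public static void main(String[] args) {
        long fileSize = 5_300;               // hypothetical metadata.ser size on disk
        int avgCompressedSizePerPage = 1024; // the 1KB-per-page assumption passed above
        System.out.println((int) (fileSize / avgCompressedSizePerPage)); // 5 (integer division floors)
        // A 500-byte file would give 0, which estimatePageCount clamps to 1
        // so the 1024 * pageCount budget used in processPage is never zero.
    }
}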
@@ -64,7 +89,6 @@ public class crawler {
     }
 
     public void crawl(int maxPages) {
-        int pageCount = 0;
         while (!url_queue.isEmpty() && pageCount < maxPages) {
             String cur_site = url_queue.poll();
             if (cur_site == null || visited_urls.contains(cur_site)) {
@@ -73,6 +97,7 @@ public class crawler {
             try {
                 Document web_data = get_web_data(cur_site);
                 if (web_data != null) {
+                    pageCount++;
                     processPage(web_data);
                     Elements links = web_data.select("a[href]");
                     for (Element link : links) {
@@ -82,7 +107,6 @@ public class crawler {
                         }
                     }
                     visited_urls.add(cur_site);
-                    pageCount++;
                 }
             } catch (IOException e) {
                 System.err.println("Error processing " + cur_site + ": " + e.getMessage());
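Taken together, these two hunks move the pageCount increment from after a page is processed to before processPage runs, so the 1024 * pageCount budget checked inside processPage already includes the page currently being extracted.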
@@ -107,33 +131,84 @@ public class crawler {
     }
 
     private void processPage(Document web_data) {
-        /*
-        //Print the text content of the Document
-        System.out.println("Text content:");
-        System.out.println(web_data.text());
-        */
-        // Extract word usage data
-        //System.err.println(web_data.text());
-        extractWordUsage(web_data.text(), wordUsage);
-        byte[] metadata = wordUsage.serialize();
-        System.out.println("Metadata extracted successfully.");
-
-        // STORE UNCOMPRESSED DATA //
-        System.out.println("Unompressed metadata size: " + metadata.length + " bytes");
-        writeToFile(metadata, "uncompressed-"+filePath);
-
-        // COMPRESS DATA //
-        byte[] compressedData = compress(metadata);
-        if (compressedData.length <= 1024) {
-            System.out.println("Successfully compressed metadata within the 1KB limit.");
+        byte[] compressedData = new byte[0];
+        byte[] uncompressedData = new byte[0];
+        boolean sizeLimitExceeded = false;
+        TrieNode previousWordUsageState = null; // Placeholder for the previous state of the trie
+        byte[] previousUncompressedData = new byte[0]; // Placeholder for the previous uncompressed data
+
+        // Break the page text into manageable chunks, considering sentences
+        List<String> chunks = splitTextIntoChunks(web_data.text(), 100);
+
+        for (String chunk : chunks) {
+            previousWordUsageState = wordUsage.clone();
+            previousUncompressedData = uncompressedData.clone();
+
+            // Process each chunk
+            extractWordUsage(chunk, wordUsage);
+            uncompressedData = wordUsage.serialize();
+
+            // Compress the serialized trie
+            compressedData = compress(uncompressedData);
+
+            // Check if compressed data size exceeds 1KB * pageCount
+            if (compressedData.length > 1024 * pageCount) {
+                System.out.println("Size limit exceeded. Reverting to previous chunk.");
+                sizeLimitExceeded = true;
+                wordUsage = previousWordUsageState; // Revert to the previous state of the trie
+                uncompressedData = previousUncompressedData; // Revert to the previous uncompressed data
+                compressedData = compress(uncompressedData); // Recompress the reverted state
+                break; // Stop processing further chunks
+            }
+        }
+
+        // Save the uncompressed and compressed data to separate files
+        String uncompressedFilePath = "uncompressed-" + filePath;
+        writeToFile(uncompressedData, uncompressedFilePath);
+        System.out.println("Uncompressed data exported successfully to: " + uncompressedFilePath);
+
+        writeToFile(compressedData, filePath);
+        if (!sizeLimitExceeded) {
+            System.out.println("Compressed tree exported successfully to: " + filePath);
+        } else {
+            System.out.println("WARNING: Compressed data exceeds 1KB limit. Consider optimization.");
+            System.out.println("Compressed data truncated due to size limit.");
+        }
+
+        // Output sizes of both compressed and uncompressed data for reference
+        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
+        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
+    }
 
-        // STORE COMPRESSED DATA //
-        writeToFile(compressedData, filePath);
-        System.out.println("Tree exported successfully to: " + filePath);
+    private List<String> splitTextIntoChunks(String text, int chunkSize) {
+        // Split the text into sentences.
+        String[] sentences = text.split("[.!?] ");
+        List<String> chunks = new ArrayList<>();
+        StringBuilder currentChunk = new StringBuilder();
+
+        for (String sentence : sentences) {
+            if (currentChunk.length() + sentence.length() + (currentChunk.length() > 0 ? 1 : 0) > chunkSize) {
+                if (currentChunk.length() > 0) {
+                    chunks.add(currentChunk.toString());
+                    currentChunk = new StringBuilder();
+                }
+                while (sentence.length() > chunkSize) {
+                    chunks.add(sentence.substring(0, chunkSize));
+                    sentence = sentence.substring(chunkSize);
+                }
+            }
+            // Add a space before the sentence if it's not the first sentence in the chunk.
+            if (currentChunk.length() > 0) {
+                currentChunk.append(" ");
+            }
+            currentChunk.append(sentence);
+        }
+
+        // Add the last chunk if it's not empty.
+        if (currentChunk.length() > 0) {
+            chunks.add(currentChunk.toString());
+        }
+
+        return chunks;
+    }
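To see what the chunker emits, here is a hedged, standalone harness (not in the commit): it copies the committed logic into a static method, since splitTextIntoChunks is a private instance method of crawler.

package edu.bu.LanguageCorrection;

import java.util.ArrayList;
import java.util.List;

public class ChunkDemo {
    // Verbatim copy of the committed chunking logic, made static for testing.
    static List<String> splitTextIntoChunks(String text, int chunkSize) {
        String[] sentences = text.split("[.!?] ");
        List<String> chunks = new ArrayList<>();
        StringBuilder currentChunk = new StringBuilder();
        for (String sentence : sentences) {
            if (currentChunk.length() + sentence.length() + (currentChunk.length() > 0 ? 1 : 0) > chunkSize) {
                if (currentChunk.length() > 0) {
                    chunks.add(currentChunk.toString());
                    currentChunk = new StringBuilder();
                }
                while (sentence.length() > chunkSize) {
                    chunks.add(sentence.substring(0, chunkSize));
                    sentence = sentence.substring(chunkSize);
                }
            }
            if (currentChunk.length() > 0) {
                currentChunk.append(" ");
            }
            currentChunk.append(sentence);
        }
        if (currentChunk.length() > 0) {
            chunks.add(currentChunk.toString());
        }
        return chunks;
    }

    public static void main(String[] args) {
        // Prints: [Short one, A second sentence he, re And a third?]
        // Note the hard cut inside "here": a sentence longer than chunkSize
        // is split at exactly chunkSize characters, mid-word, and the
        // remainder starts the next chunk.
        System.out.println(splitTextIntoChunks(
                "Short one. A second sentence here! And a third?", 20));
    }
}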