Commit 73343d3e authored by Moises Bensadon's avatar Moises Bensadon
Browse files

create large trie based on brown corpus by running ./crawler --build

parent 9f304279
Loading
Loading
Loading
Loading
+76 −32
Original line number Diff line number Diff line
@@ -30,8 +30,16 @@ public class crawler {

        // Open the file and read all lines (URLs) into the queue
        String file_url = "";
        if (args.length != 0 && args[0].equals("--file")) file_url = args[1];
        if (args.length > 2 && args[2].equals("--debug")) web_crawler.debug = true;
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("--file")) {
                file_url = args[i + 1];
            } else if (args[i].equals("--debug")) {
                web_crawler.debug = true;
            } else if (args[i].equals("--build")) {
                web_crawler.build_off_corpus = true;
            }
        } 
        if (!web_crawler.build_off_corpus) {
            try (FileReader f_read = new FileReader(file_url);
                BufferedReader buf_read = new BufferedReader(f_read)) {
                String url_line;
@@ -39,9 +47,10 @@ public class crawler {
                    web_crawler.add_to_queue(url_line);
                }
            }
        } 

        // Start crawling
        int crawlLimit = 100; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        final int crawlLimit = 100; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        web_crawler.crawl(crawlLimit);
        
        // Print visited URLs
@@ -54,7 +63,9 @@ public class crawler {
    private String filePath = "metadata.ser";
    private TrieNode wordUsage = new TrieNode();
    private int compression_size = 0;
    private boolean debug = false;
    private boolean debug = false; // flag that outputs uncompressed json showing the trie
    private boolean build_off_corpus = false;
    private static final int MAXNGRAM = 3;

    public crawler() {
        url_queue = new LinkedList<>();
@@ -72,6 +83,10 @@ public class crawler {

    public void crawl(int maxPages) {
        int pageCount = 0;
        if (build_off_corpus) {
            System.out.println("Building off corpus...");
            processPage(get_file_text("brown.txt")); // TODO: extend to multiple files i.e. one per language
        }
        while (!url_queue.isEmpty() && pageCount < maxPages) {
            String cur_site = url_queue.poll();
            if (cur_site == null || visited_urls.contains(cur_site)) {
@@ -116,12 +131,33 @@ public class crawler {
        }
    }

    private Document get_file_text(String filename) {
        // Reads the named file from src/main/java/resources/ and returns its
        // contents wrapped in a Jsoup Document titled with the filename.
        // Returns null when the file cannot be read (error is logged).
        try {
            Document doc = new Document("");
            doc.title(filename);
            // NOTE(review): FileReader uses the platform default charset here —
            // consider an explicit UTF-8 reader; confirm corpus encoding.
            try (BufferedReader reader = new BufferedReader(new FileReader("src/main/java/resources/" + filename))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the terminator; re-append a newline so the
                    // last word of one line is not fused with the first word of the
                    // next (fused tokens would corrupt the whitespace-split n-grams).
                    doc.append(line + "\n");
                }
                System.out.println("File read successfully.");
            }
            return doc;
        } catch (IOException e) {
            System.err.println("Error reading file: " + e.getMessage());
            return null;
        }
    }

    private void processPage(Document web_data) {
        byte[] compressedData = new byte[0];
        byte[] uncompressedData = new byte[0];
        boolean sizeLimitExceeded = false;
        byte[] previousUncompressedData = wordUsage.serialize();

        if(!build_off_corpus) {
            // Break the page text into manageable chunks, considering sentences
            List<String> chunks = splitTextIntoChunks(web_data.text());
            // int chunkCount = 0;
@@ -137,7 +173,7 @@ public class crawler {
                uncompressedData = wordUsage.serialize();
                compressedData = compress(uncompressedData);

            if (compressedData.length - compression_size > 1024) {
                if ((compressedData.length - compression_size > 1024) ) {
                    System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta:"+ (compressedData.length - compression_size) + " bytes.");
                    System.out.println("Size limit exceeded. Reverting to previous chunk.");
                    sizeLimitExceeded = true;
@@ -146,6 +182,13 @@ public class crawler {
                    break; // Stop processing further chunks
                }
            }
        } else {
            extractWordUsage(web_data.text(), wordUsage);
            // System.out.println("Ngrams built successfully. for size:"+MAXNGRAM);
            uncompressedData = wordUsage.serialize();
            compressedData = compress(uncompressedData);
            build_off_corpus = false; // if there are urls to read it should still be able to read them
        }

        // Save the uncompressed and compressed data to separate files
        if (debug) {
@@ -183,7 +226,7 @@ public class crawler {
        // Split text into sentences
        String[] sentences = text.split("[.!?] ");
        for (String sentence : sentences) {
            for (int nGram = 1; nGram <= 3; nGram++) {
            for (int nGram = 1; nGram <= MAXNGRAM; nGram++) {
                String[] words = sentence.split("\\s+");
                for (int i = 0; i < words.length - nGram + 1; i++) {
                    trie.insert(Arrays.copyOfRange(words, i, i + nGram));
@@ -244,6 +287,7 @@ public class crawler {
            return trie;
        } catch (IOException e) {
            System.err.println("Error reading metadata from file: " + e.getMessage());
            System.err.println("Creating new trie...");
            return new TrieNode();
        }
    }

src/main/java/resources/test.txt

deleted 100644 → 0
+0 −1
Original line number Diff line number Diff line
This is strange so choice word.
 No newline at end of file