Commit a30e50a2 authored by Tejas Thakur Singh, committed by tejassng
Browse files

Added processing rate, changed format of links found to display only the...

Added a processing-rate output; changed the links-found output to display only the number of links found, in order to reduce clutter.
parent b98d25c8
Loading
Loading
Loading
Loading
+28 −14
Original line number Diff line number Diff line
@@ -21,6 +21,8 @@ import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;

import java.lang.Math;


public class crawler {

@@ -50,7 +52,7 @@ public class crawler {
        } 

        // Start crawling
        final int crawlLimit = 100; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        final int crawlLimit = 2; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        web_crawler.crawl(crawlLimit);
        
        // Print visited URLs
@@ -97,15 +99,6 @@ public class crawler {
                Document web_data = get_web_data(cur_site);
                if (web_data != null) {
                    processPage(web_data);
                    Elements links = web_data.select("a[href]");
                    for (Element link : links) {
                        String link_url = link.attr("abs:href");
                        System.out.println("Found link: " + link_url);
                        if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                            System.out.println("Adding link to queue: " + link_url);
                            url_queue.add(link_url);
                        }
                    }
                    visited_urls.add(cur_site);
                    pageCount++;
                }
@@ -156,11 +149,24 @@ public class crawler {
        byte[] uncompressedData = new byte[0];
        boolean sizeLimitExceeded = false;
        byte[] previousUncompressedData = wordUsage.serialize();
        int foundLinks = 0;
        long startTime = System.nanoTime(),
                endTime;

        Elements links = web_data.select("a[href]");
        for (Element link : links) {
            String link_url = link.attr("abs:href");
            //System.out.println("Found link: " + link_url);
            if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                //System.out.println("Adding link to queue: " + link_url);
                foundLinks++;
                url_queue.add(link_url);
            }
        }

        if(!build_off_corpus) {
            // Break the page text into manageable chunks, considering sentences
            List<String> chunks = splitTextIntoChunks(web_data.text());
            // int chunkCount = 0;
            if (chunks.isEmpty()) {
                System.out.println("No text found on page.");
                return;
@@ -172,7 +178,6 @@ public class crawler {
                extractWordUsage(chunk, wordUsage);
                uncompressedData = wordUsage.serialize();
                compressedData = compress(uncompressedData);

                if ((compressedData.length - compression_size > 1024) ) {
                    System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "+ (compressedData.length - compression_size) + " bytes.");
                    System.out.println("Size limit exceeded. Reverting to previous chunk.");
@@ -203,10 +208,19 @@ public class crawler {
        } else {
            System.out.println("Compressed data truncated due to size limit.");
        }
        endTime = System.nanoTime();

        //System.out.println((endTime - startTime)/1000000000.0); // Total time taken to complete processing

        // Output sizes of both compressed and uncompressed data for reference
        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
        // Output rate of processing
        double processingRate = web_data.text().length()/((endTime - startTime)/1000000000.0);
        System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
        // Output # of links found in page
        System.out.println("# of additional links found: " + foundLinks + "\n");

        compression_size = compressedData.length;
    }