Commit c6bda859 authored by Tejas Thakur Singh
Browse files

Merge branch '11-implement-realtime-processing' into 'master'

Added real-time stats

Closes #11

See merge request ec504/ec504_projects/group7!7
parents b98d25c8 a30e50a2
Loading
Loading
Loading
Loading
+28 −14
Original line number Diff line number Diff line
@@ -21,6 +21,8 @@ import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;

import java.lang.Math;


public class crawler {

@@ -50,7 +52,7 @@ public class crawler {
        } 

        // Start crawling
        final int crawlLimit = 100; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        final int crawlLimit = 2; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        web_crawler.crawl(crawlLimit);
        
        // Print visited URLs
@@ -97,15 +99,6 @@ public class crawler {
                Document web_data = get_web_data(cur_site);
                if (web_data != null) {
                    processPage(web_data);
                    Elements links = web_data.select("a[href]");
                    for (Element link : links) {
                        String link_url = link.attr("abs:href");
                        System.out.println("Found link: " + link_url);
                        if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                            System.out.println("Adding link to queue: " + link_url);
                            url_queue.add(link_url);
                        }
                    }
                    visited_urls.add(cur_site);
                    pageCount++;
                }
@@ -156,11 +149,24 @@ public class crawler {
        byte[] uncompressedData = new byte[0];
        boolean sizeLimitExceeded = false;
        byte[] previousUncompressedData = wordUsage.serialize();
        int foundLinks = 0;
        long startTime = System.nanoTime(),
                endTime;

        Elements links = web_data.select("a[href]");
        for (Element link : links) {
            String link_url = link.attr("abs:href");
            //System.out.println("Found link: " + link_url);
            if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                //System.out.println("Adding link to queue: " + link_url);
                foundLinks++;
                url_queue.add(link_url);
            }
        }

        if(!build_off_corpus) {
            // Break the page text into manageable chunks, considering sentences
            List<String> chunks = splitTextIntoChunks(web_data.text());
            // int chunkCount = 0;
            if (chunks.isEmpty()) {
                System.out.println("No text found on page.");
                return;
@@ -172,7 +178,6 @@ public class crawler {
                extractWordUsage(chunk, wordUsage);
                uncompressedData = wordUsage.serialize();
                compressedData = compress(uncompressedData);

                if ((compressedData.length - compression_size > 1024) ) {
                    System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta: "+ (compressedData.length - compression_size) + " bytes.");
                    System.out.println("Size limit exceeded. Reverting to previous chunk.");
@@ -203,10 +208,19 @@ public class crawler {
        } else {
            System.out.println("Compressed data truncated due to size limit.");
        }
        endTime = System.nanoTime();

        //System.out.println((endTime - startTime)/1000000000.0); // Total time taken to complete processing

        // Output sizes of both compressed and uncompressed data for reference
        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
        // Output rate of processing
        double processingRate = web_data.text().length()/((endTime - startTime)/1000000000.0);
        System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
        // Output # of links found in page
        System.out.println("# of additional links found: " + foundLinks + "\n");

        compression_size = compressedData.length;
    }