/*
 * Commit 90a79645, authored by Leon Zeyu Long (parent: efb8656e; diff: +149 −60).
 *
 * Update crawler.java to include data compression that is stored in an output file.
 */
package org.example;

import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.helper.Validate;


import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.zip.Deflater;
import java.util.PriorityQueue;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

import java.io.File;
import java.nio.charset.StandardCharsets;

public class crawler {
    public static void main(String[] args) throws IOException {
        //Read all lines from a given file into the queue
        //System.out.println(new File(".").getAbsolutePath());

        FileReader f_read = new FileReader("crawler_test_file.txt");
        BufferedReader buf_read = new BufferedReader(f_read);
        // Initialize web crawler
        crawler web_crawler = new crawler();

        // Open the file and read all lines (URLs) into the queue
        try (FileReader f_read = new FileReader("crawler_test_file.txt");
             BufferedReader buf_read = new BufferedReader(f_read)) {
            String url_line;
            while ((url_line = buf_read.readLine()) != null) {
                web_crawler.add_to_queue(url_line);
            }
        web_crawler.crawl(0);
        }

        // Start crawling
        int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        web_crawler.crawl(crawlLimit);
        
        // Print visited URLs
        web_crawler.get_visited();
    }

    //members
    private final LinkedList<String> url_queue;
    private final HashSet<String> visited_urls;

    public crawler() {
        url_queue = new LinkedList<String>();
        visited_urls = new HashSet<String>();
        url_queue = new LinkedList<>();
        visited_urls = new HashSet<>();
    }

    public void add_to_queue(String url) {
        url_queue.add(url);
    }
    public void crawl(int num_sites) {
        //dequeue current website and add it to visited
       // if (num_sites < 2 && !url_queue.isEmpty()) {
            String cur_site = url_queue.remove();

            //connect to webpage
    public void crawl(int maxPages) {
        int pageCount = 0;
        while (!url_queue.isEmpty() && pageCount < maxPages) {
            String cur_site = url_queue.poll();
            if (cur_site == null || visited_urls.contains(cur_site)) {
                continue;
            }
            try {
                Document web_data = get_web_data(cur_site);
        /*
            - browse through URLS using jsoup
            - properly interpret robots.txt
            - compress + store text metadata
        */
                if (web_data != null) {
                    processPage(web_data);
                    Elements links = web_data.select("a[href]");
                    for (Element link : links) {
                        String link_url = link.attr("abs:href");
                    //System.out.println(link_url);
                    if (!visited_urls.contains(link_url)) {
                        if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                            url_queue.add(link_url);
                        }
                    }
                    visited_urls.add(cur_site);
                    pageCount++;
                }

            //crawl(++num_sites);
        //}
            } catch (IOException e) {
                System.err.println("Error processing " + cur_site + ": " + e.getMessage());
            }
    private Document get_web_data(String url) {
        try {
        }
        System.out.println("Total pages visited: " + pageCount);
    }    

    private Document get_web_data(String url) throws IOException {
        //use execute() in order to receive a response object -> allows status code checking
        Connection.Response req_response = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
                            .referrer("http.//www.google.com")
                .referrer("http://www.google.com")
                .execute();
            Document web_data = req_response.parse();

        //ensure that an OK is received
        if (req_response.statusCode() == 200) {
                visited_urls.add(url);
                return web_data;
            return req_response.parse();
        } else {
            throw new HttpStatusException("Status not OK", req_response.statusCode(), url);
        }
    }

    private void processPage(Document web_data) {
        /*
        //Print the text content of the Document
        System.out.println("Text content:");
        System.out.println(web_data.text());
        */

        // Extract word usage data
        Map<String, Integer> wordUsage = extractWordUsage(web_data.text());
        String metadata = serializeWordUsage(wordUsage);

        try {
            // Write the serialized word usage metadata to a file
            try (FileWriter writer = new FileWriter("metadata.txt", true)) {
                writer.write(metadata + System.lineSeparator());
                writer.flush();
            }
            System.out.println("Serialized word usage metadata written to metadata.txt");
        } catch (IOException e) {
            System.err.println("Error writing metadata to file: " + e.getMessage());
        }
        
        // Compress the metadata
        byte[] compressedData = compress(metadata);
        
        // Print size of compressed data
        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");

        // STORE COMPRESSED DATA //
    }

    private Map<String, Integer> extractWordUsage(String text) {
        Map<String, Integer> wordCount = new HashMap<>();
        // Split by whitespace and count occurrences
        for (String word : text.split("\\s+")) {
            wordCount.put(word, wordCount.getOrDefault(word, 0) + 1);
        }
        return wordCount;
    }

    private String serializeWordUsage(Map<String, Integer> wordUsage) {
        // Convert word usage to string
        StringBuilder builder = new StringBuilder();
        for (Map.Entry<String, Integer> entry : wordUsage.entrySet()) {
            builder.append(entry.getKey()).append(":").append(entry.getValue()).append(";");
        }
            return null;
        return builder.toString();
    }
        catch(IOException err) {
            return null;

    private byte[] compress(String data) {
        try {
            byte[] input = data.getBytes("UTF-8");
            // Create compressor
            Deflater compressor = new Deflater(Deflater.BEST_COMPRESSION);
            compressor.setInput(input);
            compressor.finish();

            // Store compressed data in dynamic byte array
            ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);

            byte[] buf = new byte[1024];
            while (!compressor.finished()) {
                int count = compressor.deflate(buf);
                bos.write(buf, 0, count);
                if (bos.size() > 1024) {
                    // Compressed data > 1KB
                    System.out.println("WARNING: Compressed data exceeds 1KB limit. Consider optimization.");
                    break;
                }
            }
            compressor.end();

            byte[] compressedData = bos.toByteArray();
            System.out.println("Successfully compressed metadata within the 1KB limit.");

            return compressedData;
        } catch (IOException e) {
            System.err.println("Compression error: " + e.getMessage());
            return new byte[0];
        }
    }

    // Output visited URLs as specified
    public void get_visited() {
        System.out.println("All of the visited websites:");
        for (String url : visited_urls) {
            System.out.println(url);
        }
//        System.out.println("URL Queue at the end");
//        for(String url: url_queue) {
//            System.out.println(url);
//        }
    }
    //members
    private final LinkedList<String> url_queue;
    private final HashSet<String> visited_urls;
}