Update crawler.java to include data compression that is stored in an output file (90a79645) · Commits · EC504 Spring 2024 Group Projects / Group7

Crawler/src/main/java/org/example/crawler.java

+149 −60

Original line number	Diff line number	Diff line
		package org.example;

		import org.jsoup.Connection;
		import org.jsoup.HttpStatusException;
		import org.jsoup.Jsoup;
		import org.jsoup.nodes.Document;
		import org.jsoup.nodes.Element;
		import org.jsoup.select.Elements;
		import org.jsoup.helper.Validate;


		import java.util.HashMap;
		import java.util.HashSet;
		import java.util.LinkedList;
		import java.util.Map;
		import java.util.zip.Deflater;
		import java.util.PriorityQueue;
		import java.io.BufferedReader;
		import java.io.ByteArrayOutputStream;
		import java.io.FileReader;
		import java.io.FileWriter;
		import java.io.IOException;

		import java.io.File;
		import java.nio.charset.StandardCharsets;

		public class crawler {
		/**
		 * Entry point: seeds the crawler from {@code crawler_test_file.txt} (one URL per
		 * line), crawls up to {@code crawlLimit} pages, then prints the visited URLs.
		 *
		 * @param args unused
		 * @throws IOException if the seed file cannot be opened or read
		 */
		public static void main(String[] args) throws IOException {
		    // Initialize web crawler
		    crawler web_crawler = new crawler();

		    // Open the file and read all lines (URLs) into the queue.
		    // try-with-resources closes both readers even when reading fails.
		    try (FileReader f_read = new FileReader("crawler_test_file.txt");
		         BufferedReader buf_read = new BufferedReader(f_read)) {
		        String url_line;
		        while ((url_line = buf_read.readLine()) != null) {
		            // Skip blank lines so an empty string is never queued as a URL.
		            String seed_url = url_line.trim();
		            if (!seed_url.isEmpty()) {
		                web_crawler.add_to_queue(seed_url);
		            }
		        }
		    }

		    // Start crawling
		    int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
		    web_crawler.crawl(crawlLimit);

		    // Print visited URLs
		    web_crawler.get_visited();
		}

		//members
		private final LinkedList<String> url_queue;
		private final HashSet<String> visited_urls;

		/**
		 * Creates a crawler with an empty URL queue and an empty visited set.
		 */
		public crawler() {
		    // Each final field is assigned exactly once.
		    url_queue = new LinkedList<>();
		    visited_urls = new HashSet<>();
		}

		/**
		 * Appends a URL to the tail of the pending-URL queue.
		 *
		 * @param url the URL to schedule for crawling
		 */
		public void add_to_queue(String url) {
		    url_queue.addLast(url);
		}
		public void crawl(int num_sites) {
		//dequeue current website and add it to visited
		// if (num_sites < 2 && !url_queue.isEmpty()) {
		String cur_site = url_queue.remove();

		//connect to webpage
		public void crawl(int maxPages) {
		int pageCount = 0;
		while (!url_queue.isEmpty() && pageCount < maxPages) {
		String cur_site = url_queue.poll();
		if (cur_site == null \|\| visited_urls.contains(cur_site)) {
		continue;
		}
		try {
		Document web_data = get_web_data(cur_site);
		/*
		- browse through URLS using jsoup
		- properly interpret robots.txt
		- compress + store text metadata
		*/
		if (web_data != null) {
		processPage(web_data);
		Elements links = web_data.select("a[href]");
		for (Element link : links) {
		String link_url = link.attr("abs:href");
		//System.out.println(link_url);
		if (!visited_urls.contains(link_url)) {
		if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
		url_queue.add(link_url);
		}
		}
		visited_urls.add(cur_site);
		pageCount++;
		}

		//crawl(++num_sites);
		//}
		} catch (IOException e) {
		System.err.println("Error processing " + cur_site + ": " + e.getMessage());
		}
		private Document get_web_data(String url) {
		try {
		}
		System.out.println("Total pages visited: " + pageCount);
		}

		/**
		 * Fetches and parses the page at {@code url}.
		 *
		 * <p>This method does not touch the visited set; recording visited pages is left
		 * to the caller.
		 *
		 * @param url absolute URL of the page to download
		 * @return the parsed document when the server answers with status 200
		 * @throws HttpStatusException if the response status is anything other than 200
		 * @throws IOException if the request or the parse fails
		 */
		private Document get_web_data(String url) throws IOException {
		    //use execute() in order to receive a response object -> allows status code checking
		    Connection.Response req_response = Jsoup.connect(url)
		            .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
		            .referrer("http://www.google.com")
		            .execute();

		    //ensure that an OK is received
		    // NOTE(review): jsoup's execute() is documented to throw on HTTP error statuses
		    // by default, so this check presumably only rejects non-200 success codes - confirm.
		    if (req_response.statusCode() != 200) {
		        throw new HttpStatusException("Status not OK", req_response.statusCode(), url);
		    }
		    // Parse only after the status check so rejected responses are never parsed.
		    return req_response.parse();
		}

		/**
		 * Derives word-usage metadata from a page, appends it to {@code metadata.txt},
		 * and reports the size of its compressed form.
		 *
		 * <p>A failure to write the file is reported on stderr and does not stop the
		 * compression step. The compressed bytes are only measured here; they are not
		 * persisted anywhere.
		 *
		 * @param web_data the parsed page whose text is analysed
		 */
		private void processPage(Document web_data) {
		    // Word counts of the page text, flattened into a single metadata line
		    String pageMetadata = serializeWordUsage(extractWordUsage(web_data.text()));

		    try {
		        appendMetadataLine(pageMetadata);
		        System.out.println("Serialized word usage metadata written to metadata.txt");
		    } catch (IOException writeFailure) {
		        System.err.println("Error writing metadata to file: " + writeFailure.getMessage());
		    }

		    // Compress the same metadata and report how large the result is
		    int compressedSize = compress(pageMetadata).length;
		    System.out.println("Compressed metadata size: " + compressedSize + " bytes");
		}

		/**
		 * Appends one line to {@code metadata.txt}, closing the file afterwards.
		 *
		 * @param line text to append; a platform line separator is added after it
		 * @throws IOException if the file cannot be opened, written, or closed
		 */
		private void appendMetadataLine(String line) throws IOException {
		    try (FileWriter out = new FileWriter("metadata.txt", true)) {
		        out.write(line + System.lineSeparator());
		        out.flush();
		    }
		}

		/**
		 * Counts how often each whitespace-separated token occurs in {@code text}.
		 *
		 * <p>Tokens are compared exactly: case and punctuation are preserved, so
		 * {@code "Word"}, {@code "word"} and {@code "word,"} are counted separately.
		 *
		 * @param text the text to analyse; {@code null} is treated as empty
		 * @return a map from token to occurrence count; empty when there are no tokens
		 */
		private Map<String, Integer> extractWordUsage(String text) {
		    Map<String, Integer> wordCount = new HashMap<>();
		    if (text == null) {
		        return wordCount;
		    }
		    // Split by whitespace and count occurrences
		    for (String word : text.split("\\s+")) {
		        // split() yields an empty first token for empty or leading-whitespace
		        // input; that is not a word and must not be counted.
		        if (word.isEmpty()) {
		            continue;
		        }
		        wordCount.merge(word, 1, Integer::sum);
		    }
		    return wordCount;
		}

		private String serializeWordUsage(Map<String, Integer> wordUsage) {
		// Convert word usage to string
		StringBuilder builder = new StringBuilder();
		for (Map.Entry<String, Integer> entry : wordUsage.entrySet()) {
		builder.append(entry.getKey()).append(":").append(entry.getValue()).append(";");
		}
		return null;
		return builder.toString();
		}
		catch(IOException err) {
		return null;

		/**
		 * Compresses {@code data} (encoded as UTF-8) with DEFLATE at the best-compression level.
		 *
		 * <p>The complete compressed stream is always returned, so it can be inflated
		 * again. When the result is larger than 1 KB a warning is printed; otherwise a
		 * success message is printed.
		 *
		 * @param data the text to compress; {@code null} yields an empty array
		 * @return the complete compressed bytes
		 */
		private byte[] compress(String data) {
		    if (data == null) {
		        return new byte[0];
		    }
		    final int sizeLimit = 1024; // 1KB target for compressed metadata

		    byte[] input = data.getBytes(StandardCharsets.UTF_8);
		    // Create compressor
		    Deflater compressor = new Deflater(Deflater.BEST_COMPRESSION);
		    try {
		        compressor.setInput(input);
		        compressor.finish();

		        // Store compressed data in dynamic byte array
		        ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);

		        byte[] buf = new byte[1024];
		        // Drain the compressor completely: stopping early would return a
		        // truncated stream that cannot be decompressed.
		        while (!compressor.finished()) {
		            int count = compressor.deflate(buf);
		            bos.write(buf, 0, count);
		        }

		        byte[] compressedData = bos.toByteArray();
		        if (compressedData.length > sizeLimit) {
		            // Compressed data > 1KB
		            System.out.println("WARNING: Compressed data exceeds 1KB limit. Consider optimization.");
		        } else {
		            System.out.println("Successfully compressed metadata within the 1KB limit.");
		        }
		        return compressedData;
		    } finally {
		        // Release the compressor on every exit path.
		        compressor.end();
		    }
		}

		/**
		 * Prints a header line followed by every visited URL, one per line, to
		 * standard output. The order is whatever the visited set iterates in.
		 */
		public void get_visited() {
		    System.out.println("All of the visited websites:");
		    visited_urls.forEach(System.out::println);
		}
		// (url_queue and visited_urls are declared once, at the top of the class.)
		}