Loading Crawler/src/main/java/org/example/crawler.java +149 −60 Original line number Diff line number Diff line package org.example; import org.jsoup.Connection; import org.jsoup.HttpStatusException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.jsoup.helper.Validate; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.Map; import java.util.zip.Deflater; import java.util.PriorityQueue; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.File; import java.nio.charset.StandardCharsets; public class crawler { public static void main(String[] args) throws IOException { //Read all lines from a given file into the queue //System.out.println(new File(".").getAbsolutePath()); FileReader f_read = new FileReader("crawler_test_file.txt"); BufferedReader buf_read = new BufferedReader(f_read); // Initialize web crawler crawler web_crawler = new crawler(); // Open the file and read all lines (URLs) into the queue try (FileReader f_read = new FileReader("crawler_test_file.txt"); BufferedReader buf_read = new BufferedReader(f_read)) { String url_line; while ((url_line = buf_read.readLine()) != null) { web_crawler.add_to_queue(url_line); } web_crawler.crawl(0); } // Start crawling int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE) web_crawler.crawl(crawlLimit); // Print visited URLs web_crawler.get_visited(); } //members private final LinkedList<String> url_queue; private final HashSet<String> visited_urls; public crawler() { url_queue = new LinkedList<String>(); visited_urls = new HashSet<String>(); url_queue = new LinkedList<>(); visited_urls = new HashSet<>(); } public void add_to_queue(String url) { url_queue.add(url); } public void crawl(int num_sites) { //dequeue current website and add it to visited // if (num_sites < 2 && !url_queue.isEmpty()) { String cur_site = url_queue.remove(); //connect to webpage public void crawl(int maxPages) { int pageCount = 0; while (!url_queue.isEmpty() && pageCount < maxPages) { String cur_site = url_queue.poll(); if (cur_site == null || visited_urls.contains(cur_site)) { continue; } try { Document web_data = get_web_data(cur_site); /* - browse through URLS using jsoup - properly interpret robots.txt - compress + store text metadata */ if (web_data != null) { processPage(web_data); Elements links = web_data.select("a[href]"); for (Element link : links) { String link_url = link.attr("abs:href"); //System.out.println(link_url); if (!visited_urls.contains(link_url)) { if (!link_url.isEmpty() && !visited_urls.contains(link_url)) { url_queue.add(link_url); } } visited_urls.add(cur_site); pageCount++; } //crawl(++num_sites); //} } catch (IOException e) { System.err.println("Error processing " + cur_site + ": " + e.getMessage()); } private Document get_web_data(String url) { try { } System.out.println("Total pages visited: " + pageCount); } private Document get_web_data(String url) throws IOException { //use execute() in order to receive a response object -> allows status code checking Connection.Response req_response = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36") .referrer("http.//www.google.com") .referrer("http://www.google.com") .execute(); Document web_data = req_response.parse(); //ensure that an OK is received if (req_response.statusCode() == 200) { visited_urls.add(url); return web_data; return req_response.parse(); } else { throw new HttpStatusException("Status not OK", req_response.statusCode(), url); } } private void processPage(Document web_data) { /* //Print the text content of the Document System.out.println("Text content:"); System.out.println(web_data.text()); */ // Extract word usage data Map<String, Integer> wordUsage = extractWordUsage(web_data.text()); String metadata = serializeWordUsage(wordUsage); try { // Write the serialized word usage metadata to a file try (FileWriter writer = new FileWriter("metadata.txt", true)) { writer.write(metadata + System.lineSeparator()); writer.flush(); } System.out.println("Serialized word usage metadata written to metadata.txt"); } catch (IOException e) { System.err.println("Error writing metadata to file: " + e.getMessage()); } // Compress the metadata byte[] compressedData = compress(metadata); // Print size of compressed data System.out.println("Compressed metadata size: " + compressedData.length + " bytes"); // STORE COMPRESSED DATA // } private Map<String, Integer> extractWordUsage(String text) { Map<String, Integer> wordCount = new HashMap<>(); // Split by whitespace and count occurrences for (String word : text.split("\\s+")) { wordCount.put(word, wordCount.getOrDefault(word, 0) + 1); } return wordCount; } private String serializeWordUsage(Map<String, Integer> wordUsage) { // Convert word usage to string StringBuilder builder = new StringBuilder(); for (Map.Entry<String, Integer> entry : wordUsage.entrySet()) { builder.append(entry.getKey()).append(":").append(entry.getValue()).append(";"); } return null; return builder.toString(); } catch(IOException err) { return null; private byte[] compress(String data) { try { byte[] input = data.getBytes("UTF-8"); // Create compressor Deflater compressor = new Deflater(Deflater.BEST_COMPRESSION); compressor.setInput(input); compressor.finish(); // Store compressed data in dynamic byte array ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length); byte[] buf = new byte[1024]; while (!compressor.finished()) { int count = compressor.deflate(buf); bos.write(buf, 0, count); if (bos.size() > 1024) { // Compressed data > 1KB System.out.println("WARNING: Compressed data exceeds 1KB limit. Consider optimization."); break; } } compressor.end(); byte[] compressedData = bos.toByteArray(); System.out.println("Successfully compressed metadata within the 1KB limit."); return compressedData; } catch (IOException e) { System.err.println("Compression error: " + e.getMessage()); return new byte[0]; } } // Output visited URLs as specified public void get_visited() { System.out.println("All of the visited websites:"); for (String url : visited_urls) { System.out.println(url); } // System.out.println("URL Queue at the end"); // for(String url: url_queue) { // System.out.println(url); // } } //members private final LinkedList<String> url_queue; private final HashSet<String> visited_urls; } Loading
Crawler/src/main/java/org/example/crawler.java +149 −60 Original line number Diff line number Diff line package org.example; import org.jsoup.Connection; import org.jsoup.HttpStatusException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.jsoup.helper.Validate; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.Map; import java.util.zip.Deflater; import java.util.PriorityQueue; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.File; import java.nio.charset.StandardCharsets; public class crawler { public static void main(String[] args) throws IOException { //Read all lines from a given file into the queue //System.out.println(new File(".").getAbsolutePath()); FileReader f_read = new FileReader("crawler_test_file.txt"); BufferedReader buf_read = new BufferedReader(f_read); // Initialize web crawler crawler web_crawler = new crawler(); // Open the file and read all lines (URLs) into the queue try (FileReader f_read = new FileReader("crawler_test_file.txt"); BufferedReader buf_read = new BufferedReader(f_read)) { String url_line; while ((url_line = buf_read.readLine()) != null) { web_crawler.add_to_queue(url_line); } web_crawler.crawl(0); } // Start crawling int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE) web_crawler.crawl(crawlLimit); // Print visited URLs web_crawler.get_visited(); } //members private final LinkedList<String> url_queue; private final HashSet<String> visited_urls; public crawler() { url_queue = new LinkedList<String>(); visited_urls = new HashSet<String>(); url_queue = new LinkedList<>(); visited_urls = new HashSet<>(); } public void add_to_queue(String url) { url_queue.add(url); } public void crawl(int num_sites) { //dequeue current website and add it to visited // if (num_sites < 2 && !url_queue.isEmpty()) { String cur_site = url_queue.remove(); //connect to webpage public void crawl(int maxPages) { int pageCount = 0; while (!url_queue.isEmpty() && pageCount < maxPages) { String cur_site = url_queue.poll(); if (cur_site == null || visited_urls.contains(cur_site)) { continue; } try { Document web_data = get_web_data(cur_site); /* - browse through URLS using jsoup - properly interpret robots.txt - compress + store text metadata */ if (web_data != null) { processPage(web_data); Elements links = web_data.select("a[href]"); for (Element link : links) { String link_url = link.attr("abs:href"); //System.out.println(link_url); if (!visited_urls.contains(link_url)) { if (!link_url.isEmpty() && !visited_urls.contains(link_url)) { url_queue.add(link_url); } } visited_urls.add(cur_site); pageCount++; } //crawl(++num_sites); //} } catch (IOException e) { System.err.println("Error processing " + cur_site + ": " + e.getMessage()); } private Document get_web_data(String url) { try { } System.out.println("Total pages visited: " + pageCount); } private Document get_web_data(String url) throws IOException { //use execute() in order to receive a response object -> allows status code checking Connection.Response req_response = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36") .referrer("http.//www.google.com") .referrer("http://www.google.com") .execute(); Document web_data = req_response.parse(); //ensure that an OK is received if (req_response.statusCode() == 200) { visited_urls.add(url); return web_data; return req_response.parse(); } else { throw new HttpStatusException("Status not OK", req_response.statusCode(), url); } } private void processPage(Document web_data) { /* //Print the text content of the Document System.out.println("Text content:"); System.out.println(web_data.text()); */ // Extract word usage data Map<String, Integer> wordUsage = extractWordUsage(web_data.text()); String metadata = serializeWordUsage(wordUsage); try { // Write the serialized word usage metadata to a file try (FileWriter writer = new FileWriter("metadata.txt", true)) { writer.write(metadata + System.lineSeparator()); writer.flush(); } System.out.println("Serialized word usage metadata written to metadata.txt"); } catch (IOException e) { System.err.println("Error writing metadata to file: " + e.getMessage()); } // Compress the metadata byte[] compressedData = compress(metadata); // Print size of compressed data System.out.println("Compressed metadata size: " + compressedData.length + " bytes"); // STORE COMPRESSED DATA // } private Map<String, Integer> extractWordUsage(String text) { Map<String, Integer> wordCount = new HashMap<>(); // Split by whitespace and count occurrences for (String word : text.split("\\s+")) { wordCount.put(word, wordCount.getOrDefault(word, 0) + 1); } return wordCount; } private String serializeWordUsage(Map<String, Integer> wordUsage) { // Convert word usage to string StringBuilder builder = new StringBuilder(); for (Map.Entry<String, Integer> entry : wordUsage.entrySet()) { builder.append(entry.getKey()).append(":").append(entry.getValue()).append(";"); } return null; return builder.toString(); } catch(IOException err) { return null; private byte[] compress(String data) { try { byte[] input = data.getBytes("UTF-8"); // Create compressor Deflater compressor = new Deflater(Deflater.BEST_COMPRESSION); compressor.setInput(input); compressor.finish(); // Store compressed data in dynamic byte array ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length); byte[] buf = new byte[1024]; while (!compressor.finished()) { int count = compressor.deflate(buf); bos.write(buf, 0, count); if (bos.size() > 1024) { // Compressed data > 1KB System.out.println("WARNING: Compressed data exceeds 1KB limit. Consider optimization."); break; } } compressor.end(); byte[] compressedData = bos.toByteArray(); System.out.println("Successfully compressed metadata within the 1KB limit."); return compressedData; } catch (IOException e) { System.err.println("Compression error: " + e.getMessage()); return new byte[0]; } } // Output visited URLs as specified public void get_visited() { System.out.println("All of the visited websites:"); for (String url : visited_urls) { System.out.println(url); } // System.out.println("URL Queue at the end"); // for(String url: url_queue) { // System.out.println(url); // } } //members private final LinkedList<String> url_queue; private final HashSet<String> visited_urls; }