Loading Crawler/src/main/java/org/example/crawler.java +49 −8 Original line number Diff line number Diff line package org.example; import org.jsoup.*; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.jsoup.helper.Validate; import java.util.HashSet; import java.util.LinkedList; import java.io.BufferedReader; Loading @@ -8,11 +15,11 @@ import java.io.FileReader; import java.io.IOException; import java.io.File; public class crawler { public class crawler { public static void main(String[] args) throws IOException{ //Read all lines from a given file into the queue System.out.println(new File(".").getAbsolutePath()); //System.out.println(new File(".").getAbsolutePath()); FileReader f_read = new FileReader("crawler_test_file.txt"); BufferedReader buf_read = new BufferedReader(f_read); Loading @@ -34,25 +41,59 @@ public class crawler { } public void crawl(int num_sites) { //dequeue current website and add it to visited // if (num_sites < 2 && !url_queue.isEmpty()) { String cur_site = url_queue.remove(); visited_urls.add(cur_site); //connect to webpage Document web_data = get_web_data(cur_site); /* - browse through URLS using jsoup - properly interpret robots.txt - compress + store text metadata */ while ((!url_queue.isEmpty())) { crawl(++num_sites); if (web_data != null) { Elements links = web_data.select("a[href]"); for (Element link : links) { String link_url = link.attr("abs:href"); //System.out.println(link_url); if (!visited_urls.contains(link_url)) { url_queue.add(link_url); } } } //crawl(++num_sites); //} } private Document get_web_data(String url) { try { //use execute() in order to receive a response object -> allows status code checking Connection.Response req_response = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36") .referrer("http.//www.google.com") .execute(); Document web_data = req_response.parse(); //ensure that an OK is received if(req_response.statusCode() == 200) { visited_urls.add(url); return web_data; } return null; } catch(IOException err) { return null; } } public void get_visited() { System.out.println("All of the visited websites:"); for(String url: visited_urls) { System.out.println(url); } // System.out.println("URL Queue at the end"); // for(String url: url_queue) { // System.out.println(url); // } } //members private final LinkedList<String> url_queue; Loading Crawler/target/classes/org/example/crawler.class +1.3 KiB (3.42 KiB) File changed.No diff preview for this file type. View original file View changed file crawler_test_file.txt 0 → 100644 +23 −0 Original line number Diff line number Diff line https://en.wikipedia.org/wiki/Web_crawler https://link.springer.com/book/10.1007/978-3-540-46332-0 http://www.youtube.com http://www.facebook.com http://www.yahoo.com http://www.amazon.com http://www.wikipedia.org http://www.twitter.com http://www.live.com http://www.bing.com http://www.instagram.com http://www.linkedin.com http://www.msn.com http://www.vk.com http://www.hao123.com http://www.reddit.com http://www.ebay.com http://www.t.co http://www.tmall.com http://www.sohu.com http://www.pinterest.com http://www.netflix.com http://www.microsoft.com No newline at end of file Loading
Crawler/src/main/java/org/example/crawler.java +49 −8 Original line number Diff line number Diff line package org.example; import org.jsoup.*; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.jsoup.helper.Validate; import java.util.HashSet; import java.util.LinkedList; import java.io.BufferedReader; Loading @@ -8,11 +15,11 @@ import java.io.FileReader; import java.io.IOException; import java.io.File; public class crawler { public class crawler { public static void main(String[] args) throws IOException{ //Read all lines from a given file into the queue System.out.println(new File(".").getAbsolutePath()); //System.out.println(new File(".").getAbsolutePath()); FileReader f_read = new FileReader("crawler_test_file.txt"); BufferedReader buf_read = new BufferedReader(f_read); Loading @@ -34,25 +41,59 @@ public class crawler { } public void crawl(int num_sites) { //dequeue current website and add it to visited // if (num_sites < 2 && !url_queue.isEmpty()) { String cur_site = url_queue.remove(); visited_urls.add(cur_site); //connect to webpage Document web_data = get_web_data(cur_site); /* - browse through URLS using jsoup - properly interpret robots.txt - compress + store text metadata */ while ((!url_queue.isEmpty())) { crawl(++num_sites); if (web_data != null) { Elements links = web_data.select("a[href]"); for (Element link : links) { String link_url = link.attr("abs:href"); //System.out.println(link_url); if (!visited_urls.contains(link_url)) { url_queue.add(link_url); } } } //crawl(++num_sites); //} } private Document get_web_data(String url) { try { //use execute() in order to receive a response object -> allows status code checking Connection.Response req_response = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36") .referrer("http.//www.google.com") .execute(); Document web_data = req_response.parse(); //ensure that an OK is received if(req_response.statusCode() == 200) { visited_urls.add(url); return web_data; } return null; } catch(IOException err) { return null; } } public void get_visited() { System.out.println("All of the visited websites:"); for(String url: visited_urls) { System.out.println(url); } // System.out.println("URL Queue at the end"); // for(String url: url_queue) { // System.out.println(url); // } } //members private final LinkedList<String> url_queue; Loading
Crawler/target/classes/org/example/crawler.class +1.3 KiB (3.42 KiB) File changed.No diff preview for this file type. View original file View changed file
crawler_test_file.txt 0 → 100644 +23 −0 Original line number Diff line number Diff line https://en.wikipedia.org/wiki/Web_crawler https://link.springer.com/book/10.1007/978-3-540-46332-0 http://www.youtube.com http://www.facebook.com http://www.yahoo.com http://www.amazon.com http://www.wikipedia.org http://www.twitter.com http://www.live.com http://www.bing.com http://www.instagram.com http://www.linkedin.com http://www.msn.com http://www.vk.com http://www.hao123.com http://www.reddit.com http://www.ebay.com http://www.t.co http://www.tmall.com http://www.sohu.com http://www.pinterest.com http://www.netflix.com http://www.microsoft.com No newline at end of file