diff --git a/Crawler/src/main/java/org/example/crawler.java b/Crawler/src/main/java/org/example/crawler.java
index 5233e82df44d80d3bb56447ec8838f2640ac2693..fc70f7923d4aa28a7739eb3d8073e6f91818a6c6 100644
--- a/Crawler/src/main/java/org/example/crawler.java
+++ b/Crawler/src/main/java/org/example/crawler.java
@@ -1,6 +1,13 @@
 package org.example;
 
-import org.jsoup.*;
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.jsoup.helper.Validate;
+
+
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.io.BufferedReader;
@@ -8,11 +15,11 @@
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.File;
 
-public class crawler {
+public class crawler {
 
     public static void main(String[] args) throws IOException{
         //Read all lines from a given file into the queue
-        System.out.println(new File(".").getAbsolutePath());
+        //System.out.println(new File(".").getAbsolutePath());
         FileReader f_read = new FileReader("crawler_test_file.txt");
         BufferedReader buf_read = new BufferedReader(f_read);
@@ -34,25 +41,59 @@
     }
 
     public void crawl(int num_sites) {
         //dequeue current website and add it to visited
-        String cur_site = url_queue.remove();
-        visited_urls.add(cur_site);
+        // if (num_sites < 2 && !url_queue.isEmpty()) {
+        String cur_site = url_queue.remove();
+        //connect to webpage
+        Document web_data = get_web_data(cur_site);
         /*
         browse through URLS using jsoup
         properly interpret robots.txt
         compress + store text metadata
         */
+        if (web_data != null) {
+            Elements links = web_data.select("a[href]");
+            for (Element link : links) {
+                String link_url = link.attr("abs:href");
+                //System.out.println(link_url);
+                if (!visited_urls.contains(link_url)) {
+                    url_queue.add(link_url);
+                }
+            }
+        }
 
-        while ((!url_queue.isEmpty())) {
-            crawl(++num_sites);
-        }
+        //crawl(++num_sites);
+        //}
     }
+    private Document get_web_data(String url) {
+        try {
+            //use execute() in order to receive a response object -> allows status code checking
+            Connection.Response req_response = Jsoup.connect(url)
+                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
+                    .referrer("http://www.google.com")
+                    .execute();
+            Document web_data = req_response.parse();
+            //ensure that an OK is received
+            if(req_response.statusCode() == 200) {
+                visited_urls.add(url);
+                return web_data;
+            }
+            return null;
+        }
+        catch(IOException err) {
+            return null;
+        }
+    }
     public void get_visited() {
         System.out.println("All of the visited websites:");
         for(String url: visited_urls) {
             System.out.println(url);
         }
+//        System.out.println("URL Queue at the end");
+//        for(String url: url_queue) {
+//            System.out.println(url);
+//        }
     }
 
     //members
     private final LinkedList<String> url_queue;
diff --git a/Crawler/target/classes/org/example/crawler.class b/Crawler/target/classes/org/example/crawler.class
index 09db5d9476171e5d22d4497c78b8a11a6d51bf68..f843844cba93e6b529615dc56b2fb17438f1d218 100644
Binary files a/Crawler/target/classes/org/example/crawler.class and b/Crawler/target/classes/org/example/crawler.class differ
diff --git a/crawler_test_file.txt b/crawler_test_file.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8f772b8eafe1afe598fa71fc98f1236a697b323a
--- /dev/null
+++ b/crawler_test_file.txt
@@ -0,0 +1,23 @@
+https://en.wikipedia.org/wiki/Web_crawler
+https://link.springer.com/book/10.1007/978-3-540-46332-0
+http://www.youtube.com
+http://www.facebook.com
+http://www.yahoo.com
+http://www.amazon.com
+http://www.wikipedia.org
+http://www.twitter.com
+http://www.live.com
+http://www.bing.com
+http://www.instagram.com
+http://www.linkedin.com
+http://www.msn.com
+http://www.vk.com
+http://www.hao123.com
+http://www.reddit.com
+http://www.ebay.com
+http://www.t.co
+http://www.tmall.com
+http://www.sohu.com
+http://www.pinterest.com
+http://www.netflix.com
+http://www.microsoft.com
\ No newline at end of file