diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..26d33521af10bcc7fd8cea344038eaaeb78d0ef5 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 0000000000000000000000000000000000000000..d2cebb0efc39e8710b68f40bc080e4f51a64b2f3 --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000000000000000000000000000000000000..b7aa75eb9a45f1f4e52d34c43f010b575a3857f4 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/.idea/group7.iml b/.idea/group7.iml new file mode 100644 index 0000000000000000000000000000000000000000..d6ebd4805981b8400db3e3291c74a743fef9a824 --- /dev/null +++ b/.idea/group7.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml new file mode 100644 index 0000000000000000000000000000000000000000..712ab9d985c20018a0c97b93d2148ac1ffe588a5 --- /dev/null +++ b/.idea/jarRepositories.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..905619eeeb1728d0017a45d8f572326c923146ef --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..5ce3c5e116c4f3c414e8445650743f9153a59318 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml new file mode 100644 index 0000000000000000000000000000000000000000..2b63946d5b31084bbb7dda418ceb3d75eb686373 --- /dev/null +++ b/.idea/uiDesigner.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..35eb1ddfbbc029bcab630581847471d7f238ec53 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Crawler/pom.xml b/Crawler/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..e1d04f160b3c2b132dcce15c54e93fa6bfb3d9bc --- /dev/null +++ b/Crawler/pom.xml @@ -0,0 +1,25 @@ + + + 4.0.0 + + org.example + Crawler + 1.0-SNAPSHOT + + + 18 + 18 + UTF-8 + + + + + org.jsoup + jsoup + 1.15.3 + + + + \ No newline at end of file diff --git a/Crawler/src/main/java/org/example/crawler.java b/Crawler/src/main/java/org/example/crawler.java new file mode 100644 index 0000000000000000000000000000000000000000..5233e82df44d80d3bb56447ec8838f2640ac2693 --- /dev/null +++ b/Crawler/src/main/java/org/example/crawler.java @@ -0,0 +1,60 @@ +package org.example; + +import org.jsoup.*; +import java.util.HashSet; +import java.util.LinkedList; +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + +import java.io.File; +public class crawler { + + public static void main(String[] args) throws IOException{ + //Read all lines from a given file into the queue + System.out.println(new File(".").getAbsolutePath()); + + FileReader f_read = new FileReader("crawler_test_file.txt"); + BufferedReader buf_read = new BufferedReader(f_read); + //Initialize web crawler + crawler web_crawler = new crawler(); + String url_line; + while ((url_line = buf_read.readLine()) != null) { + web_crawler.add_to_queue(url_line); + } + web_crawler.crawl(0); + web_crawler.get_visited(); + } + public crawler() { + url_queue = new LinkedList(); + visited_urls = new HashSet(); + } + public void add_to_queue(String url) { + url_queue.add(url); + } + public void crawl(int num_sites) { + //dequeue current website and add it to visited + String cur_site = url_queue.remove(); + visited_urls.add(cur_site); + + /* + - browse through URLS using jsoup + - properly interpret robots.txt + - compress + store text metadata + */ + + while ((!url_queue.isEmpty())) { + crawl(++num_sites); + } + } + + public void get_visited() { + System.out.println("All of the visited websites:"); + for(String url: visited_urls) { + System.out.println(url); + } + } + //members + private final LinkedList url_queue; + private final HashSet visited_urls; +} diff --git a/Crawler/target/classes/org/example/crawler.class b/Crawler/target/classes/org/example/crawler.class new file mode 100644 index 0000000000000000000000000000000000000000..09db5d9476171e5d22d4497c78b8a11a6d51bf68 Binary files /dev/null and b/Crawler/target/classes/org/example/crawler.class differ