From cf35eb49ab8b921407e376c2c014160b8f35b357 Mon Sep 17 00:00:00 2001 From: Tejas Thakur Singh Date: Sun, 31 Mar 2024 01:06:19 -0400 Subject: [PATCH] added jsoup --- .idea/.gitignore | 3 + .idea/compiler.xml | 13 ++ .idea/encodings.xml | 7 + .idea/group7.iml | 9 ++ .idea/jarRepositories.xml | 20 +++ .idea/misc.xml | 14 ++ .idea/modules.xml | 8 ++ .idea/uiDesigner.xml | 124 ++++++++++++++++++ .idea/vcs.xml | 6 + Crawler/pom.xml | 25 ++++ .../src/main/java/org/example/crawler.java | 60 +++++++++ .../target/classes/org/example/crawler.class | Bin 0 -> 2173 bytes 12 files changed, 289 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/compiler.xml create mode 100644 .idea/encodings.xml create mode 100644 .idea/group7.iml create mode 100644 .idea/jarRepositories.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/uiDesigner.xml create mode 100644 .idea/vcs.xml create mode 100644 Crawler/pom.xml create mode 100644 Crawler/src/main/java/org/example/crawler.java create mode 100644 Crawler/target/classes/org/example/crawler.class diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 0000000..d2cebb0 --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..b7aa75e --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/.idea/group7.iml b/.idea/group7.iml new file mode 100644 index 0000000..d6ebd48 --- /dev/null +++ b/.idea/group7.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml new file mode 100644 index 0000000..712ab9d --- /dev/null +++ b/.idea/jarRepositories.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..905619e --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..5ce3c5e --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml new file mode 100644 index 0000000..2b63946 --- /dev/null +++ b/.idea/uiDesigner.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Crawler/pom.xml b/Crawler/pom.xml new file mode 100644 index 0000000..e1d04f1 --- /dev/null +++ b/Crawler/pom.xml @@ -0,0 +1,25 @@ + + + 4.0.0 + + org.example + Crawler + 1.0-SNAPSHOT + + + 18 + 18 + UTF-8 + + + + + org.jsoup + jsoup + 1.15.3 + + + + \ No newline at end of file diff --git a/Crawler/src/main/java/org/example/crawler.java b/Crawler/src/main/java/org/example/crawler.java new file mode 100644 index 0000000..5233e82 --- /dev/null +++ b/Crawler/src/main/java/org/example/crawler.java @@ -0,0 +1,60 @@ +package org.example; + +import org.jsoup.*; +import java.util.HashSet; +import java.util.LinkedList; +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + +import java.io.File; +public class crawler { + + public static void main(String[] args) throws IOException{ + //Read all lines from a given file into the queue + System.out.println(new File(".").getAbsolutePath()); + + FileReader f_read = new FileReader("crawler_test_file.txt"); + BufferedReader buf_read = new BufferedReader(f_read); + //Initialize web crawler + crawler web_crawler = new crawler(); + String url_line; + while ((url_line = buf_read.readLine()) != null) { + web_crawler.add_to_queue(url_line); + } + web_crawler.crawl(0); + web_crawler.get_visited(); + } + public crawler() { + url_queue = new LinkedList(); + visited_urls = new HashSet(); + } + public void add_to_queue(String url) { + url_queue.add(url); + } + public void crawl(int num_sites) { + //dequeue current website and add it to visited + String cur_site = url_queue.remove(); + visited_urls.add(cur_site); + + /* + - browse through URLS using jsoup + - properly interpret robots.txt + - compress + store text metadata + */ + + while ((!url_queue.isEmpty())) { + crawl(++num_sites); + } + } + + public void get_visited() { + System.out.println("All of the visited websites:"); + for(String url: visited_urls) { + System.out.println(url); + } + } + //members + private final LinkedList url_queue; + private final HashSet visited_urls; +} diff --git a/Crawler/target/classes/org/example/crawler.class b/Crawler/target/classes/org/example/crawler.class new file mode 100644 index 0000000000000000000000000000000000000000..09db5d9476171e5d22d4497c78b8a11a6d51bf68 GIT binary patch literal 2173 zcmZ`)ZF3V<6n<_R*fd!vgi;D^p%e>kN)plXVuO`}MT%{C2^I@#+$6W@!X_!3yQTQX z2c7XZ=pW#-j-;cVas1>5e}F&1kB*-m27K=Brf=YMy4icrxzBmdbIv{c$Mau)1u%}s z?TDbwK-9zz>=f8@&sw&!j#aB<^N$)zRt4JXo)XwU?OW`6_PT4=R9?B#s^$#D1k9G@ zvhBzi;sPQQhXK>XF65dbflbr|j3rIfsTnvZkZieVjyI)MmTn9^q*QXPhmLd$ zN;XtsfyRtlQE_y`G;mm;t99<8x3C~xSq{#|(T9GW-cd#mv!XPCkvSPSMnm1LWaWxg zU2pn)@63VW@1PnMK{b5z$)&r`OOW#Idg z;Tz2UEC$<&m@{!6Zws`0t`iWTAVD>@9bqH9Bd~V^?TXb{%u7Oz2T&JHT*A9dCnQ!N zWP=iILJN=Nbok3A-oq72#O$~AO~5`kd`J6THSs>C*b_snA%x)2Y&CAx) z+%9X78zycd&xWxZlhq~l$gj+u7(Ni_op7A~`a-{2l>H&|{)e)twQY=PfZHZM#5`-q z9oJHImkPBy*4Jn%+|MO&7atk;*u*C~SkRDLT*zS2YRt+NrF}m&Vd>POHEys5dAm}x zl;<+Z)F#z7%69(cdorFAII+n&l=y!E5!6 z4tJ8%^^)bxS+1?u;bzowE4sq30Asxt3cA@@uOJlf&c?+g=~&Tgd33LPY`uc?O0X1# z@M7r+^@{tZv2^!38_#*VZi* zLoM1&uZ`d=PR*+p{HEaSn_|e4=lW}bc)sqrC8@Jx_lION8WKl&#ZGeyN08L_3<5=e zRpOeLWqn=dXl{qZ0zE7A_!Cj?09Sv6Sc5U2Y+ps9iH;`rt)Y89d1w{A(`)FPA6dnb znKcZ|ui~}YlTDnMAB+ADGkq}HMC!?lf6{GDq|;5D`hjR|Smg6KBLN*`uou024sz0_ z(ZySG02go&lQ@Ja^uVUqXutyZaG$GQWZ5k0W_5;19#vs$sX~kngIk>NCtf8Xbs^n@??a2@zhTk8);(l_6Azqw;ZJ;13pSu z;2EOu3?4BhmRwKd5bs<1v=J|*@tTH$wDK{UjB$Z*u zByX`#G@~Zwrbm9p9f9v~fG-6BO_na?y6?%0zqS(_MU4G=3}=vHKcsPPU6f<|glj3Z akeQI8u5&`u*<;}DzpRPCm)wfrtN#E$4eF8r literal 0 HcmV?d00001 -- GitLab