From 2367127fded2c097b1fa9df6279b3e9bb408aa2f Mon Sep 17 00:00:00 2001 From: alexrmelnick Date: Fri, 12 Apr 2024 17:50:32 -0400 Subject: [PATCH] initial inquiry into cross-language dictionaries --- Crawler/src/main/java/ScratchCrawler.class | Bin 10823 -> 10968 bytes Crawler/src/main/java/ScratchCrawler.java | 5 ++ Crawler/src/main/java/crawledData.txt | 93 +++++++++++---------- 3 files changed, 55 insertions(+), 43 deletions(-) diff --git a/Crawler/src/main/java/ScratchCrawler.class b/Crawler/src/main/java/ScratchCrawler.class index 255b2164e45d9c3f54f13ecb902aa80e2883e6d2..97a4d46032ca0a6abb63bbcc90dcae8b1f644a56 100644 GIT binary patch delta 1820 zcmY+E32>7|6vyBH%kiaYif!5?4bsv!J)oVo9MVde4j@>O8>9jvV1Pgh2`NcR(Q(KO z2o7?}0&-u26eu7xL(3smP`MNl5CsJkL{Y&TR6xM@Eeb=l!N`EW(1f+1&i{9YgYJnQ zUv)*;>vNAEQ{^7x3CXMxzQCA5e;^dj=Nf-t{JquWR|kTlLIvKi-)F))F+5=)){BOO z&e$MsBotwjXk{(HW-;8_30uW#YrcNF1$E-M)s|BaN*L>&=q{`ZSNICMxkKJ(JYh3- zkj{)<*sWrZ1#e@oh)XOp?Z7)0GT>b?B=H(P7JZWjCx6Q2eS?9Ko=~V_{FpS}cu5*e z_)P3eYKhb0N>VPq5O!M&oE05yc{ne|+0yZ~m~Cr?i=xriT(q-0#5J1@--+`!yGXO! z@V$(B+HGxq1UdVAsDhN*sr;QE-}362oU^xz zzXpow8t}U-#>|is_5dh$L*&R1iZ%wM8W%Fw6lHW0~sHn7>Xqe_dRG2 zeYlNo54UM7jk}+JQ0pTD8JQg75mUGY_J-UB!gdaT!>ruRYUfrtZIP)WwP`;b7L`qN z9zDWc7E*3gu}Uc_C94#JVN|50MeCZbRB$-jFGU9&5#_0`GrOEx$)k!vX|;^>>Ai4B z^iQA5t%2w;{O(8D1yv%!L7Biog zna*5pMr_F(#@v$CmU*YFRF$udwj|-n+Q7U?ww7O(-68TJK9V~kWfSSh#9KbNhfIST zFs-MqG!akt))nb(`rO8~*qCe6w<$3wMMkHg9@A@i-?W0wxbo(5uA|ATjy`dh{qudo zdCl0D^r_NcU_ZCdOZo)%Jq9oI#Ig4~4T+dhkC{toh%QoOtf6v^%#ozKUISe7&VZgfdvvcR1G9(V37nyQ<2eT+{9B6U6RUQlzk3cU$)6?)7TcX<<{Et z`WJY6Q&IVtr{1)zwkUL@cYP)-|+V%+5(Kc}0>Hl1$P^sknwFNxmpHWZ-Xdbc%c!~YY_lb#=G9}H zb_)3nu`Z=-{yGtU1*ec1Q>h82Q!-wqtC&I8@fzL1Oz1HSM$ASG=D>=1uwgz@A&`v) zXoH0j6!U}H2}{rg%kUJIqdQh%5LV+ktie#M!#KPNKh~oX8}K4FVhT3#u(e#i87r`b zd$uy&hMky>M(n_T)Z+j@VcIQdO-Xktfeb2+s=P|R4t#BHQ*SMZ*M4!47R_SQV~tkp z<2R_uP0r-BWA|_00zQD+Z$J{FQJSiY(y=B{I>mN|?JT#8_h1Jf&4wts?nmnVXowc+ z_EO4D>})_|6lHtJ&*}T3bFuaOZ`FW6{$Z-{YC0;QvZ;8UDI6rB^x)S-jaG->Rm| z;pHI6>jR;H6c8wHEWs370Us?OiXezmMG+hY5fl|ssK5^2$obmqw>{j=m5z9dL+ z{vzfZ{N0&1`hCph!UZzNG9Q=OE$axc(@iS1P!1^?nKgY&8lGyRM1@PNx?AW#=2m`S ze3o8lbyn~4qc|qFMao1nkeTx{9wy+&h7!9Vf?OHfZy~uWY;Y`)W_MtH=B8hAx9-{;E<5*BZgAE~5 z6%AlejzyC6RCUr#Pgx>CC!-G1b+AkVw@C*!9W0l?q=igQ({`@R*T?@F-anC_M*)u) zdCcOm!l|m`cn9PT%u%6lLJNAdI9^lxR3%a6UdnNxB2HbZNQgVxzMfMgStChAjZyb1 z8XFw7mox`nKR{Uujp7Gsh|H?wEa_}uM~v>|FRvmy1O2RO!DqynC#`Cpn_59yFpiyi zZ*jM?{r<*3B{SdRy=0sZj?24fS3V>6z>7DP~mN!W@R*oK+df%%w+ z#dwp;Rdbnr&|4^v>@8#?qlOQ-iNIaNAHW?Ns5{AgC1$23_!3O^9=9o}iPJde!`6>z zIT^-r4eo_@*b}4Kh8UepiqTmfjXchC+`SIf++DF4Wwd%)tM|C>84gm%{n%H7+874b z(L_!^7^9-QAEju@xv%^qoN0J^((0tq+XQz vt<>+NUY7cU)E}i@m-@5RUvv%NHyQjc^$)4HrT(QDcu5w5qADiEOrC!L#4E7? diff --git a/Crawler/src/main/java/ScratchCrawler.java b/Crawler/src/main/java/ScratchCrawler.java index 3b2c450..bfa0e6f 100644 --- a/Crawler/src/main/java/ScratchCrawler.java +++ b/Crawler/src/main/java/ScratchCrawler.java @@ -415,6 +415,11 @@ public class ScratchCrawler { crawler.pagesToVisit.add("https://www.tumblr.com/"); startCrawl = true; break; + case "--dutch": + // Extension of our crawler with an English to Dutch translation + crawler.pagesToVisit.add("https://travelwithlanguages.com/blog/most-common-dutch-words.html"); + startCrawl = true; + break; case "--help": System.out.println("Usage: java ScratchCrawler [--file ] or [--seed ] or [--help]"); System.out.println("--file : Read URLs from a file and start crawling"); diff --git a/Crawler/src/main/java/crawledData.txt b/Crawler/src/main/java/crawledData.txt index 04d59b4..3235066 100644 --- a/Crawler/src/main/java/crawledData.txt +++ b/Crawler/src/main/java/crawledData.txt @@ -1,43 +1,50 @@ - Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
- Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

- RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 - Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
- Example Domain

Example Domain

This domain is for use in illustrative examples in documents. You may use - Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
- Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
- Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
- Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

- RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 - Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
- Example Domain

Example Domain

This domain is for use in illustrative examples in documents. You may use - Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

- RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 - Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
-Internet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback Machine

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
+ Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

+ RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 + Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
+ Example Domain

Example Domain

This domain is for use in illustrative examples in documents. You may use + Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
+ Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
+ Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
+ Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

+ RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 + Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
+ Example Domain

Example Domain

This domain is for use in illustrative examples in documents. You may use + Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

+ RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 + Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
+Internet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback Machine The 1000 most common Dutch words [Complete List]