diff --git a/Crawler/src/main/java/ScratchCrawler.class b/Crawler/src/main/java/ScratchCrawler.class index 255b2164e45d9c3f54f13ecb902aa80e2883e6d2..97a4d46032ca0a6abb63bbcc90dcae8b1f644a56 100644 Binary files a/Crawler/src/main/java/ScratchCrawler.class and b/Crawler/src/main/java/ScratchCrawler.class differ diff --git a/Crawler/src/main/java/ScratchCrawler.java b/Crawler/src/main/java/ScratchCrawler.java index 3b2c450a8ca2a20c5ac31e1f6717e5dfe953d20c..bfa0e6f1b3bf192d8c4dafdea534d2c14a7c4795 100644 --- a/Crawler/src/main/java/ScratchCrawler.java +++ b/Crawler/src/main/java/ScratchCrawler.java @@ -415,6 +415,11 @@ public class ScratchCrawler { crawler.pagesToVisit.add("https://www.tumblr.com/"); startCrawl = true; break; + case "--dutch": + // Extension of our crawler with an English to Dutch translation + crawler.pagesToVisit.add("https://travelwithlanguages.com/blog/most-common-dutch-words.html"); + startCrawl = true; + break; case "--help": System.out.println("Usage: java ScratchCrawler [--file ] or [--seed ] or [--help]"); System.out.println("--file : Read URLs from a file and start crawling"); diff --git a/Crawler/src/main/java/crawledData.txt b/Crawler/src/main/java/crawledData.txt index 04d59b443cb614295366b23decc1a08e29d12398..3235066c0d2c9fe9e3d6abdefac9a75c42c45f9b 100644 --- a/Crawler/src/main/java/crawledData.txt +++ b/Crawler/src/main/java/crawledData.txt @@ -1,43 +1,50 @@ - Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
- Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

- RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 - Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
- Example Domain

Example Domain

This domain is for use in illustrative examples in documents. You may use - Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
- Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
- Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
- Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

- RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 - Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
- Example Domain

Example Domain

This domain is for use in illustrative examples in documents. You may use - Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

- RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 - Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
-Internet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback Machine

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
+ Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

+ RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 + Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
+ Example Domain

Example Domain

This domain is for use in illustrative examples in documents. You may use + Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
+ Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
+ Crawler Test Site

Link alternate media handheld


yLb2Nwr0mmT+FJwNkwkk
+ Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

+ RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 + Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
+ Example Domain

Example Domain

This domain is for use in illustrative examples in documents. You may use + Link alternate media print

Link alternate media print


4aibl+gquo9XzDJWu0Va

+ RSS Title This is an example of an RSS feed http://www.example.com/main.html Mon, 06 Sep 2010 00:01:00 +0000 Sun, 06 Sep 2009 16:20:00 +0000 1800 Example entry Here is some text containing an interesting description. http://www.example.com/blog/post/1 7bd204c6-1655-4c27-aeee-53f933c5395f Sun, 06 Sep 2009 16:20:00 +0000 + Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
+Internet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback Machine The 1000 most common Dutch words [Complete List]