diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b016a89fbafd4b802a61d3207cf76f7c2253c6e
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "java.compile.nullAnalysis.mode": "automatic"
+}
\ No newline at end of file
diff --git a/Crawler/src/main/java/RegexParser.class b/Crawler/src/main/java/RegexParser.class
index 207d35936d0cf7df40e19ea52777372099768bee..2e08aeb4d7f880525fca08562e5bec97cad59d13 100644
Binary files a/Crawler/src/main/java/RegexParser.class and b/Crawler/src/main/java/RegexParser.class differ
diff --git a/Crawler/src/main/java/ScratchCrawler.class b/Crawler/src/main/java/ScratchCrawler.class
index 438336352d22fb39aaa2ee1ec67dc254b1fb86c7..474cfed93c54c897aa941ee965c00c2d69e76b8c 100644
Binary files a/Crawler/src/main/java/ScratchCrawler.class and b/Crawler/src/main/java/ScratchCrawler.class differ
diff --git a/Crawler/src/main/java/ScratchCrawler.java b/Crawler/src/main/java/ScratchCrawler.java
index 750c1c482748a2e674692bcb8d2e84c0fab296ed..d3224e8db2f5c936751e5dfc710628ea9ef9d45f 100644
--- a/Crawler/src/main/java/ScratchCrawler.java
+++ b/Crawler/src/main/java/ScratchCrawler.java
@@ -13,21 +13,28 @@ import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.PrintWriter;
+import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 
 public class ScratchCrawler {
 
   public static final int MAX_PAGES = 100; // Maximum pages to crawl
+  public static long waitTime = 200; // Time to wait between requests in milliseconds
+  public static int totalSize = 0; // Total size of pages visited in bytes
+  public static double processingRatePages = 0; // Processing rate in pages per second (inverse waitTime)
+  public static double processingRateLinks = 0; // Processing rate in links per second
+  public static double processingRateSize = 0; // Processing rate in bytes per second
+
   // Using a HashSet to store visited pages and pages to visit. This is the best data structure
   // for this use case because it has O(1) time complexity for add, remove, and contains
   // operations and enforces uniqueness (we do not want to crawl pages more than once).
   // Also we do not care about the order of the pages.
-  public Set pagesVisited = new HashSet(); // Set to store visited pages
-  public Set pagesToVisit = new HashSet(); // Set to store pages to visit
-  public Set disallowedDomains = new HashSet(); // Set to store disallowed domains
+  public static Set pagesVisited = new HashSet(); // Set to store visited pages
+  public static Set pagesToVisit = new HashSet(); // Set to store pages to visit
+  public static Set disallowedDomains = new HashSet(); // Set to store disallowed domains
 
   // In order to store the robots.txt restrictions, we are going to use a HashMap with the domain
   // as the key and an object representing the restrictions as the value. This is the best data
@@ -51,9 +58,11 @@ public class ScratchCrawler {
   }
 
   // https://docs.oracle.com/javase/tutorial/networking/urls/index.html
-  public void getPage(String url) {
+  public static void getPage(String url) {
     // Code to get the page
-    System.out.println("Getting page: " + url); // Print message
+
+    // Provide real-time status and statistics feedback for the crawler
+    System.out.println("Processing URL: " + url);
 
     if (url.endsWith(")")) {
       url = url.substring(0, url.length() - 1);
@@ -110,7 +119,7 @@ public class ScratchCrawler {
       String line; // Declare a string to store each line of the page
       StringBuilder pageContent = new StringBuilder(); // To store the page content
-
+      int linksExtracted = 0; // Number of links extracted from the page
       while ((line = reader.readLine()) != null && pageContent.length() < 1024) { // While there are lines to read
         pageContent.append(line); // Add the line to the page content
@@ -119,12 +128,27 @@ public class ScratchCrawler {
         for (String link : links) { // For each link
           if (!pagesVisited.contains(link)) { // If the link has not been visited
             pagesToVisit.add(link); // Add the link to pagesToVisit
+            linksExtracted++; // Increment the number of links extracted
           }
         }
       }
 
       // Write the page content to the file, up to 1KB
       writer.println(pageContent.toString().substring(0, Math.min(1024, pageContent.length())));
 
+      // Provide real-time status and statistics feedback for the crawler
+      totalSize += pageContent.length();
+      System.out.println("Length of page processed [Bytes]: " + pageContent.length());
+      System.out.println("Total size of pages visited [Bytes]: " + totalSize);
+      System.out.println("Number of links extracted: " + linksExtracted);
+      System.out.println("Number of pages crawled (" + MAX_PAGES + " pages max): " + pagesVisited.size());
+      System.out.println("URLs available to crawl: " + pagesToVisit.size());
+      processingRatePages = 1000 / (double)waitTime; // Processing rate in pages per second (inverse waitTime)
+      processingRateLinks = linksExtracted * processingRatePages; // Processing rate in links per second
+      processingRateSize = pageContent.length() * processingRatePages; // Processing rate in bytes per second
+      System.out.println("Processing rate in pages per second: " + processingRatePages);
+      System.out.println("Processing rate in links per second: " + processingRateLinks);
+      System.out.println("Processing rate in bytes per second: " + processingRateSize);
+
       writer.close(); // Close the writer
       reader.close(); // Close the reader
@@ -337,11 +361,22 @@ public class ScratchCrawler {
     //crawler.crawl("https://archive.org/details/bostonpubliclibrary"); // Start off the crawl with the seed page
 
     // Parse command-line arguments
-    for (int i = 0; i < args.length; i++) {
-      if ("--file".equals(args[i]) && i + 1 < args.length) {
-        String filePath = args[i + 1];
-        crawler.readURLsFromFile(filePath);
-      }
+    if ("--file".equals(args[0]) && args.length == 2) {
+      String filePath = args[1];
+      crawler.readURLsFromFile(filePath);
+    }
+    else if("--seed".equals(args[0]) && args.length == 2) {
+      String seed = args[1];
+      crawler.crawl(seed);
+    }
+    else if("--help".equals(args[0])) {
+      System.out.println("Usage: java ScratchCrawler [--file <file>] or [--seed <seed>] or [--help]");
+      System.out.println("--file <file>: Read URLs from a file and start crawling");
+      System.out.println("--seed <seed>: Start crawling from a seed URL");
+      System.out.println("--help: Display this help message");
+    }
+    else {
+      System.out.println("Invalid arguments. Use --help for usage information.");
     }
 
     crawler.crawl(); // Start off the crawl with the seed pages
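Note: the statistics block added to getPage() derives all three throughput figures from the fixed request delay (waitTime) rather than from measured elapsed time. A minimal, self-contained sketch of that arithmetic is shown below; the class name RateSketch and the sample values are illustrative only and are not part of this patch.

public class RateSketch {
    public static void main(String[] args) {
        long waitTime = 200;     // assumed delay between requests, in milliseconds
        int linksExtracted = 12; // links found on the page just processed (sample value)
        int pageLength = 1024;   // bytes of page content kept for that page (sample value)

        // One page is handled per waitTime interval, so pages/second is the inverse of the delay.
        double processingRatePages = 1000.0 / waitTime;
        double processingRateLinks = linksExtracted * processingRatePages;
        double processingRateSize = pageLength * processingRatePages;

        System.out.println("Processing rate in pages per second: " + processingRatePages);
        System.out.println("Processing rate in links per second: " + processingRateLinks);
        System.out.println("Processing rate in bytes per second: " + processingRateSize);
    }
}

Because the rates are computed from waitTime alone, they describe a theoretical ceiling and ignore network latency and parsing time; timing each request with System.nanoTime() would give an observed rate instead.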
diff --git a/Crawler/src/main/java/crawledData.txt b/Crawler/src/main/java/crawledData.txt
index 400ebd2d05cbe2a5950fc7d3aed3eb74c8d4c4b8..4513d749cacf942d0952495e11df4152193e570c 100644
--- a/Crawler/src/main/java/crawledData.txt
+++ b/Crawler/src/main/java/crawledData.txt
@@ -1,36 +1,8 @@
-Ari's Homepage - under deconstruction Who Work Pubs StudsInternet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback MachineAri's Homepage - under deconstruction Who Work Pubs StudsJava TutorialInternet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback MachineAri's Homepage - under deconstruction Who Work Pubs StudsInternet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback MachineRequest RejectedThe requested URL was rejected. Please consult with your administrator. Your support ID is: <13194503468879793968> [Go Back]
-Ari's Homepage - under deconstruction Who Work Pubs StudsInternet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback MachineRequest RejectedThe requested URL was rejected. Please consult with your administrator. Your support ID is: <13194503468879866352> [Go Back]
-Ari's Homepage - under deconstruction Who Work Pubs StudsInternet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback MachineRequest RejectedThe requested URL was rejected. Please consult with your administrator. Your support ID is: <13194503468879956624> [Go Back]
- Crawler Test Site Ari's Homepage - under deconstruction Who Work Pubs StudsInternet Archive: Digital Library of Free & Borrowable Books, Movies, Music & Wayback MachineRequest RejectedThe requested URL was rejected. Please consult with your administrator. Your support ID is: <13194503468880083824> [Go Back]
- Crawler Test Site Ari's Homepage - under deconstruction