Loading src/main/java/Debug.java +1 −0 Original line number Diff line number Diff line Loading @@ -3,4 +3,5 @@ public class Debug { // Set to true to enable debug messages. Set to false to disable debug messages public static boolean DEBUG = true; public static boolean DEBUG_RobotsTXT = false; } src/main/java/RegexParser.java +11 −6 Original line number Diff line number Diff line Loading @@ -11,7 +11,7 @@ import java.util.regex.Pattern; public class RegexParser { public static void main(String[] args) { /*public static void main(String[] args) { // Read URLs from a text file List<String> urls = readUrlsFromFile("urls.txt"); List<String> links = new ArrayList<String>(); Loading Loading @@ -39,7 +39,7 @@ public class RegexParser { e.printStackTrace(); } } } }*/ // Function to read URLs from a text file private static List<String> readUrlsFromFile(String filename) { Loading Loading @@ -72,11 +72,16 @@ public class RegexParser { } // Function to extract links from the page private static List<String> extractLinks(Document doc) { public static List<String> extractLinks(String text) { List<String> links = new ArrayList<>(); Elements elements = doc.select("a[href]"); for (Element element : elements) { links.add(element.attr("href")); // Regular expression pattern to match URLs String urlPattern = "(https?://\\S+|www\\.\\S+)"; Pattern pattern = Pattern.compile(urlPattern); Matcher matcher = pattern.matcher(text); // Find all matches of URLs in the text while (matcher.find()) { links.add(matcher.group()); } return links; } Loading src/main/java/ScratchCrawler.java +43 −11 Original line number Diff line number Diff line import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; Loading @@ -6,11 +7,12 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.Iterator; import java.util.List; import java.io.BufferedReader; import java.io.FileWriter; import 
java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; public class ScratchCrawler { public static final int MAX_PAGES = 100; // Maximum pages to crawl public static long waitTime = 200; // Time to wait between requests in milliseconds Loading @@ -21,6 +23,7 @@ public class ScratchCrawler { // Also we do not care about the order of the pages. public Set<String> pagesVisited = new HashSet<String>(); // Set to store visited pages public Set<String> pagesToVisit = new HashSet<String>(); // Set to store pages to visit public Set<String> disallowedDomains = new HashSet<String>(); // Set to store disallowed domains // In order to store the robots.txt restrictions, we are going to use a HashMap with the domain // as the key and an object representing the restrictions as the value. This is the best data Loading Loading @@ -49,14 +52,36 @@ public class ScratchCrawler { if(Debug.DEBUG) System.out.println("Getting page: " + url); // Print message if (url.endsWith(")")) { url = url.substring(0, url.length() - 1); } try { URL pageURL = new URL(url); // Create a new URL object String domain = extractDomain(url); // Extract the domain from the URL HttpURLConnection connection = (HttpURLConnection) pageURL.openConnection(); // Set the User-Agent header connection.setRequestProperty("User-Agent", "Mozilla/5.0"); int responseCode = connection.getResponseCode(); if (responseCode != HttpURLConnection.HTTP_OK) { System.out.println("Error reading page. 
Response code: " + responseCode); // If the page is not found, add the domain to the disallowedDomains set disallowedDomains.add(domain); if (Debug.DEBUG) System.out.println("Adding domain to disallowedDomains: " + domain); // Print message return; } // Check if url is allowed by robots.txt if (isInVisitedRobotsTxt(url)) { // If the URL is in the visited robots.txt RobotsTXT robotsTXT = visitedRobotsTXTs.get(url); // Get the RobotsTXT object for the URL if (robotsTXT.getDisallowedPaths().contains(url)) { // If the URL is disallowed System.out.println("URL is disallowed by robots.txt: " + url); // Print message if (isInVisitedRobotsTxt(domain)) { // If the URL is in the visited robots.txt RobotsTXT robotsTXT = visitedRobotsTXTs.get(domain); // Get the RobotsTXT object for the URL if (robotsTXT.getDisallowedPaths().contains(url) || disallowedDomains.contains(domain)) { // If the URL is disallowed System.out.println("URL is disallowed"); // Print message return; // Exit the method } else { // URL is allowed Loading @@ -77,11 +102,19 @@ public class ScratchCrawler { // Code to read the page BufferedReader reader = new BufferedReader(new InputStreamReader(pageURL.openStream())); // Create a new BufferReader object PrintWriter writer = new PrintWriter("src/main/resources/crawledData.txt"); // Create a new PrintWriter object PrintWriter writer = new PrintWriter(new FileWriter("src/main/resources/crawledData.txt",true)); // Create a new PrintWriter object String line; // Declare a string to store each line of the page while ((line = reader.readLine()) != null) { // While there are lines to read writer.println(line); // Write the line to the file // Extract links from the page List<String> links = RegexParser.extractLinks(line); // Extract the links from the line for (String link : links) { // For each link if (!pagesVisited.contains(link)) { // If the link has not been visited pagesToVisit.add(link); // Add the link to pagesToVisit } } } writer.close(); // Close the writer 
reader.close(); // Close the reader Loading Loading @@ -136,20 +169,20 @@ public class ScratchCrawler { // parse Disallow robotsTXT.addDisallowedPath(line.substring(10)); // Add the disallowed path to the RobotsTXT object if (Debug.DEBUG) if (Debug.DEBUG_RobotsTXT) System.out.println(line); // Print message } else if (line.startsWith("Allow: ")) { // parse Allow robotsTXT.addAllowedPath(line.substring(7)); // Add the allowed path to the RobotsTXT object if (Debug.DEBUG) if (Debug.DEBUG_RobotsTXT) System.out.println(line); // Print message } else if (line.startsWith("Crawl-delay: ")) { // parse Crawl-delay int delay = Integer.parseInt(line.substring(13)); // Parse the crawl delay robotsTXT.setCrawlDelay(delay); // Set the crawl delay if (Debug.DEBUG) if (Debug.DEBUG_RobotsTXT) System.out.println(line); // Print message } } Loading Loading @@ -252,7 +285,6 @@ public class ScratchCrawler { try { Thread.sleep(waitTime); // Wait to be polite getPage(nextPage); // Get the page //extractURLS(nextPage); // Parse the page } catch (InterruptedException e) { System.out.println("Error waiting between crawling pages."); e.printStackTrace(); Loading @@ -267,7 +299,7 @@ public class ScratchCrawler { public static void main(String[] args) { // Test the getNextPage method ScratchCrawler crawler = new ScratchCrawler(); // Create a new ScratchCrawler object crawler.crawl("https://en.wikipedia.com/"); // Start off the crawl with the seed page crawler.crawl("https://archive.org/details/bostonpubliclibrary"); // Start off the crawl with the seed page // String myURL = "https://wikipedia.org/"; // Set the URL to test // parseRobotsTXT(myURL); // Test the parseRobotsTXT method Loading src/main/resources/crawledData.txt +2.59 MiB (2.69 MiB) File changed.No diff preview for this file type. View original file View changed file target/classes/Debug.class +44 B (376 B) File changed.No diff preview for this file type. View original file View changed file Loading
/**
 * Global switches for console diagnostics (src/main/java/Debug.java).
 *
 * <p>Flip a flag to {@code true} to enable the corresponding debug messages,
 * or to {@code false} to silence them.
 */
public class Debug {

    /** General debug messages. */
    public static boolean DEBUG = true;

    /** Separate switch for robots.txt-related debug output; off by default. */
    public static boolean DEBUG_RobotsTXT = false;
}
src/main/java/RegexParser.java +11 −6 Original line number Diff line number Diff line Loading @@ -11,7 +11,7 @@ import java.util.regex.Pattern; public class RegexParser { public static void main(String[] args) { /*public static void main(String[] args) { // Read URLs from a text file List<String> urls = readUrlsFromFile("urls.txt"); List<String> links = new ArrayList<String>(); Loading Loading @@ -39,7 +39,7 @@ public class RegexParser { e.printStackTrace(); } } } }*/ // Function to read URLs from a text file private static List<String> readUrlsFromFile(String filename) { Loading Loading @@ -72,11 +72,16 @@ public class RegexParser { } // Function to extract links from the page private static List<String> extractLinks(Document doc) { public static List<String> extractLinks(String text) { List<String> links = new ArrayList<>(); Elements elements = doc.select("a[href]"); for (Element element : elements) { links.add(element.attr("href")); // Regular expression pattern to match URLs String urlPattern = "(https?://\\S+|www\\.\\S+)"; Pattern pattern = Pattern.compile(urlPattern); Matcher matcher = pattern.matcher(text); // Find all matches of URLs in the text while (matcher.find()) { links.add(matcher.group()); } return links; } Loading
src/main/java/ScratchCrawler.java +43 −11 Original line number Diff line number Diff line import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; Loading @@ -6,11 +7,12 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.Iterator; import java.util.List; import java.io.BufferedReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; public class ScratchCrawler { public static final int MAX_PAGES = 100; // Maximum pages to crawl public static long waitTime = 200; // Time to wait between requests in milliseconds Loading @@ -21,6 +23,7 @@ public class ScratchCrawler { // Also we do not care about the order of the pages. public Set<String> pagesVisited = new HashSet<String>(); // Set to store visited pages public Set<String> pagesToVisit = new HashSet<String>(); // Set to store pages to visit public Set<String> disallowedDomains = new HashSet<String>(); // Set to store disallowed domains // In order to store the robots.txt restrictions, we are going to use a HashMap with the domain // as the key and an object representing the restrictions as the value. This is the best data Loading Loading @@ -49,14 +52,36 @@ public class ScratchCrawler { if(Debug.DEBUG) System.out.println("Getting page: " + url); // Print message if (url.endsWith(")")) { url = url.substring(0, url.length() - 1); } try { URL pageURL = new URL(url); // Create a new URL object String domain = extractDomain(url); // Extract the domain from the URL HttpURLConnection connection = (HttpURLConnection) pageURL.openConnection(); // Set the User-Agent header connection.setRequestProperty("User-Agent", "Mozilla/5.0"); int responseCode = connection.getResponseCode(); if (responseCode != HttpURLConnection.HTTP_OK) { System.out.println("Error reading page. 
Response code: " + responseCode); // If the page is not found, add the domain to the disallowedDomains set disallowedDomains.add(domain); if (Debug.DEBUG) System.out.println("Adding domain to disallowedDomains: " + domain); // Print message return; } // Check if url is allowed by robots.txt if (isInVisitedRobotsTxt(url)) { // If the URL is in the visited robots.txt RobotsTXT robotsTXT = visitedRobotsTXTs.get(url); // Get the RobotsTXT object for the URL if (robotsTXT.getDisallowedPaths().contains(url)) { // If the URL is disallowed System.out.println("URL is disallowed by robots.txt: " + url); // Print message if (isInVisitedRobotsTxt(domain)) { // If the URL is in the visited robots.txt RobotsTXT robotsTXT = visitedRobotsTXTs.get(domain); // Get the RobotsTXT object for the URL if (robotsTXT.getDisallowedPaths().contains(url) || disallowedDomains.contains(domain)) { // If the URL is disallowed System.out.println("URL is disallowed"); // Print message return; // Exit the method } else { // URL is allowed Loading @@ -77,11 +102,19 @@ public class ScratchCrawler { // Code to read the page BufferedReader reader = new BufferedReader(new InputStreamReader(pageURL.openStream())); // Create a new BufferReader object PrintWriter writer = new PrintWriter("src/main/resources/crawledData.txt"); // Create a new PrintWriter object PrintWriter writer = new PrintWriter(new FileWriter("src/main/resources/crawledData.txt",true)); // Create a new PrintWriter object String line; // Declare a string to store each line of the page while ((line = reader.readLine()) != null) { // While there are lines to read writer.println(line); // Write the line to the file // Extract links from the page List<String> links = RegexParser.extractLinks(line); // Extract the links from the line for (String link : links) { // For each link if (!pagesVisited.contains(link)) { // If the link has not been visited pagesToVisit.add(link); // Add the link to pagesToVisit } } } writer.close(); // Close the writer 
reader.close(); // Close the reader Loading Loading @@ -136,20 +169,20 @@ public class ScratchCrawler { // parse Disallow robotsTXT.addDisallowedPath(line.substring(10)); // Add the disallowed path to the RobotsTXT object if (Debug.DEBUG) if (Debug.DEBUG_RobotsTXT) System.out.println(line); // Print message } else if (line.startsWith("Allow: ")) { // parse Allow robotsTXT.addAllowedPath(line.substring(7)); // Add the allowed path to the RobotsTXT object if (Debug.DEBUG) if (Debug.DEBUG_RobotsTXT) System.out.println(line); // Print message } else if (line.startsWith("Crawl-delay: ")) { // parse Crawl-delay int delay = Integer.parseInt(line.substring(13)); // Parse the crawl delay robotsTXT.setCrawlDelay(delay); // Set the crawl delay if (Debug.DEBUG) if (Debug.DEBUG_RobotsTXT) System.out.println(line); // Print message } } Loading Loading @@ -252,7 +285,6 @@ public class ScratchCrawler { try { Thread.sleep(waitTime); // Wait to be polite getPage(nextPage); // Get the page //extractURLS(nextPage); // Parse the page } catch (InterruptedException e) { System.out.println("Error waiting between crawling pages."); e.printStackTrace(); Loading @@ -267,7 +299,7 @@ public class ScratchCrawler { public static void main(String[] args) { // Test the getNextPage method ScratchCrawler crawler = new ScratchCrawler(); // Create a new ScratchCrawler object crawler.crawl("https://en.wikipedia.com/"); // Start off the crawl with the seed page crawler.crawl("https://archive.org/details/bostonpubliclibrary"); // Start off the crawl with the seed page // String myURL = "https://wikipedia.org/"; // Set the URL to test // parseRobotsTXT(myURL); // Test the parseRobotsTXT method Loading
src/main/resources/crawledData.txt +2.59 MiB (2.69 MiB) File changed. No diff preview for this file type. View original file View changed file
target/classes/Debug.class +44 B (376 B) File changed. No diff preview for this file type. View original file View changed file