Loading .gitignore +3 −1 Original line number Diff line number Diff line Loading @@ -6,3 +6,5 @@ CheckerCorrector/test.* CheckerCorrector/*.txt CheckerCorrector/*db !CheckerCorrector/manifest* Crawler/target/classes/ScratchCrawler.class Crawler/target/classes/crawledData.txt No newline at end of file Crawler/src/main/java/RegexParser.class +228 B (3.31 KiB) File changed.No diff preview for this file type. View original file View changed file Crawler/src/main/java/RegexParser.java +8 −4 Original line number Diff line number Diff line Loading @@ -30,17 +30,21 @@ public class RegexParser { return sentences; } // Function to extract links from the page public static List<String> extractLinks(String text) { List<String> links = new ArrayList<>(); // Regular expression pattern to match URLs String urlPattern = "(https?://\\S+|www\\.\\S+)"; String urlPattern = "\\b(https?://|-//|www\\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]"; Pattern pattern = Pattern.compile(urlPattern); Matcher matcher = pattern.matcher(text); // Find all matches of URLs in the text while (matcher.find()) { links.add(matcher.group()); String url = matcher.group(); // Trim trailing quotation mark if present if (url.endsWith("\"")) { url = url.substring(0, url.length() - 1); } links.add(url); } return links; } Loading Crawler/src/main/java/ScratchCrawler.class −36 B (9.63 KiB) File changed.No diff preview for this file type. View original file View changed file Crawler/src/main/java/ScratchCrawler.java +14 −8 Original line number Diff line number Diff line Loading @@ -64,9 +64,10 @@ public class ScratchCrawler { // Provide real-time status and statistics feedback for the crawler System.out.println("Processing URL: " + url); if (url.endsWith(")")) { url = url.substring(0, url.length() - 1); } // REMOVE ME - fixing RegexParser.extractLinks() to handle URLs ending with ')' // if (url.endsWith(")")) { // url = url.substring(0, url.length() - 1); // } try { URL pageURL = new URL(url); // Create a new URL object Loading Loading @@ -228,9 +229,13 @@ public class ScratchCrawler { e.printStackTrace(); return; // Exit the method } catch (IOException e) { if (Debug.DEBUG_RobotsTXT) { System.out.println("Error fetching robots.txt file."); e.printStackTrace(); return; // Exit the method } // If the robots.txt file is not found, add the domain to the disallowedDomains set disallowedDomains.add(domain); } // Store the RobotsTXT object in the visitedRobotsTXTs map Loading Loading @@ -364,10 +369,12 @@ public class ScratchCrawler { if ("--file".equals(args[0]) && args.length == 2) { String filePath = args[1]; crawler.readURLsFromFile(filePath); crawler.crawl(); // Start off the crawl with the seed pages } else if("--seed".equals(args[0]) && args.length == 2) { String seed = args[1]; crawler.crawl(seed); crawler.crawl(seed); // Start off the crawl with the seed page } else if("--help".equals(args[0])) { System.out.println("Usage: java ScratchCrawler [--file <file_path>] or [--seed <seed_url>] or [--help]"); Loading @@ -379,6 +386,5 @@ public class ScratchCrawler { System.out.println("Invalid arguments. Use --help for usage information."); } crawler.crawl(); // Start off the crawl with the seed pages } } Loading
.gitignore +3 −1 Original line number Diff line number Diff line Loading @@ -6,3 +6,5 @@ CheckerCorrector/test.* CheckerCorrector/*.txt CheckerCorrector/*db !CheckerCorrector/manifest* Crawler/target/classes/ScratchCrawler.class Crawler/target/classes/crawledData.txt No newline at end of file
Crawler/src/main/java/RegexParser.class +228 B (3.31 KiB) File changed.No diff preview for this file type. View original file View changed file
Crawler/src/main/java/RegexParser.java +8 −4 Original line number Diff line number Diff line Loading @@ -30,17 +30,21 @@ public class RegexParser { return sentences; } // Function to extract links from the page public static List<String> extractLinks(String text) { List<String> links = new ArrayList<>(); // Regular expression pattern to match URLs String urlPattern = "(https?://\\S+|www\\.\\S+)"; String urlPattern = "\\b(https?://|-//|www\\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]"; Pattern pattern = Pattern.compile(urlPattern); Matcher matcher = pattern.matcher(text); // Find all matches of URLs in the text while (matcher.find()) { links.add(matcher.group()); String url = matcher.group(); // Trim trailing quotation mark if present if (url.endsWith("\"")) { url = url.substring(0, url.length() - 1); } links.add(url); } return links; } Loading
Crawler/src/main/java/ScratchCrawler.class −36 B (9.63 KiB) File changed.No diff preview for this file type. View original file View changed file
Crawler/src/main/java/ScratchCrawler.java +14 −8 Original line number Diff line number Diff line Loading @@ -64,9 +64,10 @@ public class ScratchCrawler { // Provide real-time status and statistics feedback for the crawler System.out.println("Processing URL: " + url); if (url.endsWith(")")) { url = url.substring(0, url.length() - 1); } // REMOVE ME - fixing RegexParser.extractLinks() to handle URLs ending with ')' // if (url.endsWith(")")) { // url = url.substring(0, url.length() - 1); // } try { URL pageURL = new URL(url); // Create a new URL object Loading Loading @@ -228,9 +229,13 @@ public class ScratchCrawler { e.printStackTrace(); return; // Exit the method } catch (IOException e) { if (Debug.DEBUG_RobotsTXT) { System.out.println("Error fetching robots.txt file."); e.printStackTrace(); return; // Exit the method } // If the robots.txt file is not found, add the domain to the disallowedDomains set disallowedDomains.add(domain); } // Store the RobotsTXT object in the visitedRobotsTXTs map Loading Loading @@ -364,10 +369,12 @@ public class ScratchCrawler { if ("--file".equals(args[0]) && args.length == 2) { String filePath = args[1]; crawler.readURLsFromFile(filePath); crawler.crawl(); // Start off the crawl with the seed pages } else if("--seed".equals(args[0]) && args.length == 2) { String seed = args[1]; crawler.crawl(seed); crawler.crawl(seed); // Start off the crawl with the seed page } else if("--help".equals(args[0])) { System.out.println("Usage: java ScratchCrawler [--file <file_path>] or [--seed <seed_url>] or [--help]"); Loading @@ -379,6 +386,5 @@ public class ScratchCrawler { System.out.println("Invalid arguments. Use --help for usage information."); } crawler.crawl(); // Start off the crawl with the seed pages } }