Fixed URL parsing (6e067c21) · Commits · EC504 Spring 2024 Group Projects / Group6

.gitignore

+3 −1

Original line number	Diff line number	Diff line
		@@ -6,3 +6,5 @@ CheckerCorrector/test.*
		CheckerCorrector/*.txt
		CheckerCorrector/*db
		!CheckerCorrector/manifest*
		Crawler/target/classes/ScratchCrawler.class
		Crawler/target/classes/crawledData.txt
		No newline at end of file

Crawler/src/main/java/RegexParser.class

+228 B (3.31 KiB)

File changed.

No diff preview for this file type.

View original file

View changed file

Crawler/src/main/java/RegexParser.java

+8 −4

Original line number	Diff line number	Diff line
		@@ -30,17 +30,21 @@ public class RegexParser {
		return sentences;
		}

		// Function to extract links from the page
		public static List<String> extractLinks(String text) {
		List<String> links = new ArrayList<>();
		// Regular expression pattern to match URLs
		String urlPattern = "(https?://\\S+|www\\.\\S+)";
		String urlPattern = "\\b(https?://|-//|www\\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]";
		Pattern pattern = Pattern.compile(urlPattern);
		Matcher matcher = pattern.matcher(text);

		// Find all matches of URLs in the text
		while (matcher.find()) {
		links.add(matcher.group());
		String url = matcher.group();
		// Trim trailing quotation mark if present
		if (url.endsWith("\"")) {
		url = url.substring(0, url.length() - 1);
		}
		links.add(url);
		}
		return links;
		}

Crawler/src/main/java/ScratchCrawler.class

−36 B (9.63 KiB)

File changed.

No diff preview for this file type.

View original file

View changed file

Crawler/src/main/java/ScratchCrawler.java

+14 −8

Original line number	Diff line number	Diff line
		@@ -64,9 +64,10 @@ public class ScratchCrawler {
		// Provide real-time status and statistics feedback for the crawler
		System.out.println("Processing URL: " + url);

		if (url.endsWith(")")) {
		url = url.substring(0, url.length() - 1);
		}
		// REMOVE ME - fixing RegexParser.extractLinks() to handle URLs ending with ')'
		// if (url.endsWith(")")) {
		// url = url.substring(0, url.length() - 1);
		// }

		try {
		URL pageURL = new URL(url); // Create a new URL object
		@@ -228,9 +229,13 @@ public class ScratchCrawler {
		e.printStackTrace();
		return; // Exit the method
		} catch (IOException e) {
		if (Debug.DEBUG_RobotsTXT) {
		System.out.println("Error fetching robots.txt file.");
		e.printStackTrace();
		return; // Exit the method
		}

		// If the robots.txt file is not found, add the domain to the disallowedDomains set
		disallowedDomains.add(domain);
		}

		// Store the RobotsTXT object in the visitedRobotsTXTs map
		@@ -364,10 +369,12 @@ public class ScratchCrawler {
		if ("--file".equals(args[0]) && args.length == 2) {
		String filePath = args[1];
		crawler.readURLsFromFile(filePath);
		crawler.crawl(); // Start off the crawl with the seed pages

		}
		else if("--seed".equals(args[0]) && args.length == 2) {
		String seed = args[1];
		crawler.crawl(seed);
		crawler.crawl(seed); // Start off the crawl with the seed page
		}
		else if("--help".equals(args[0])) {
		System.out.println("Usage: java ScratchCrawler [--file <file_path>] or [--seed <seed_url>] or [--help]");
		@@ -379,6 +386,5 @@ public class ScratchCrawler {
		System.out.println("Invalid arguments. Use --help for usage information.");
		}

		crawler.crawl(); // Start off the crawl with the seed pages
		}
		}