Commit 6e067c21 authored by Alexander Ross Melnick
Browse files

Fixed URL parsing

parent 81d499b7
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -6,3 +6,5 @@ CheckerCorrector/test.*
CheckerCorrector/*.txt
CheckerCorrector/*db
!CheckerCorrector/manifest*
Crawler/target/classes/ScratchCrawler.class
Crawler/target/classes/crawledData.txt
 No newline at end of file
+228 B (3.31 KiB)

File changed.

No diff preview for this file type.

+8 −4
Original line number Diff line number Diff line
@@ -30,17 +30,21 @@ public class RegexParser {
        return sentences;
    }

    /**
     * Pattern used by {@link #extractLinks(String)} to recognise URLs.
     *
     * <p>A match starts at a word boundary with {@code http://}, {@code https://},
     * {@code -//} or {@code www.}, continues over characters that are legal inside
     * a URL, and must end on a character that is not sentence punctuation. Because
     * the final character class excludes {@code "}, {@code )}, {@code .}, {@code ,}
     * and similar characters, trailing quotes, brackets and punctuation are never
     * part of a match and need no trimming afterwards.
     */
    private static final Pattern URL_LINK_PATTERN = Pattern.compile(
            "\\b(https?://|-//|www\\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]");

    /**
     * Extracts every URL found in the given text, in order of appearance.
     *
     * <p>Duplicates are preserved: a URL that occurs twice in {@code text} is
     * returned twice.
     *
     * @param text the page content to scan; may be {@code null}
     * @return a mutable list of the matched URLs; empty when {@code text} is
     *         {@code null} or contains no URL
     */
    public static List<String> extractLinks(String text) {
        List<String> links = new ArrayList<>();
        if (text == null) {
            // Nothing to scan; return an empty result instead of throwing.
            return links;
        }

        Matcher matcher = URL_LINK_PATTERN.matcher(text);
        // Each match is recorded exactly once.
        while (matcher.find()) {
            links.add(matcher.group());
        }
        return links;
    }
−36 B (9.63 KiB)

File changed.

No diff preview for this file type.

+14 −8
Original line number Diff line number Diff line
@@ -64,9 +64,10 @@ public class ScratchCrawler {
        // Provide real-time status and statistics feedback for the crawler
        System.out.println("Processing URL: " + url); 

        if (url.endsWith(")")) {
            url = url.substring(0, url.length() - 1);
        }
        // REMOVE ME - fixing RegexParser.extractLinks() to handle URLs ending with ')'
        // if (url.endsWith(")")) {
        //     url = url.substring(0, url.length() - 1);
        // }
        
        try {
            URL pageURL = new URL(url); // Create a new URL object
@@ -228,9 +229,13 @@ public class ScratchCrawler {
            e.printStackTrace();
            return; // Exit the method
        } catch (IOException e) {
            if (Debug.DEBUG_RobotsTXT) {
                System.out.println("Error fetching robots.txt file.");
                e.printStackTrace();
            return; // Exit the method
            }
            
            // If the robots.txt file is not found, add the domain to the disallowedDomains set
            disallowedDomains.add(domain);
        }

        // Store the RobotsTXT object in the visitedRobotsTXTs map
@@ -364,10 +369,12 @@ public class ScratchCrawler {
        if ("--file".equals(args[0]) && args.length == 2) {
            String filePath = args[1];
            crawler.readURLsFromFile(filePath);
            crawler.crawl(); // Start off the crawl with the seed pages

        }
        else if("--seed".equals(args[0]) && args.length == 2) {
            String seed = args[1];
            crawler.crawl(seed);
            crawler.crawl(seed); // Start off the crawl with the seed page
        }
        else if("--help".equals(args[0])) {
            System.out.println("Usage: java ScratchCrawler [--file <file_path>] or [--seed <seed_url>] or [--help]");
@@ -379,6 +386,5 @@ public class ScratchCrawler {
            System.out.println("Invalid arguments. Use --help for usage information.");
        }

        crawler.crawl(); // Start off the crawl with the seed pages
    }
}
Loading