Commit 3aa55921 authored by Moises Bensadon's avatar Moises Bensadon
Browse files

Fix URL stripping of anchor fragments and query parameters

parent a077e7d1
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -146,7 +146,7 @@ public class MainApp extends JFrame {
                List<String> lines = Files.readAllLines(Paths.get(input));
                lines.forEach(webCrawler::add_to_queue);
            } catch (Exception e) {
                resultArea.setText("Error reading file: " + e.getMessage());
                resultArea.setText("Error reading file/link. Please make sure to include http or .txt for link or file respectively: " + e.getMessage());
                return;
            }
        }
@@ -324,7 +324,7 @@ public class MainApp extends JFrame {
            }

            resultArea.setText(result.toString());

            // ISSUE #30 - Feedback for corrector
            // Add a pop up to input the best correction for each sentence
            TrieNode node = corrector.getDetector();
            boolean changeMade = false;
+27 −17
Original line number Diff line number Diff line
@@ -10,6 +10,9 @@ import org.jsoup.select.Elements;
import java.util.*;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import javax.swing.JProgressBar;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
@@ -56,9 +59,7 @@ public class crawler {
        }
        if (!file_url.isEmpty()) {
            if (web_crawler.is_username) {
                String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user
                                                                                   // page
                // System.out.println(file_url);
                String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user page
                web_crawler.add_to_queue(user_url);
            } else {
                try (FileReader f_read = new FileReader(file_url);
@@ -72,7 +73,7 @@ public class crawler {
        }

        // Start crawling
        final int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        final int crawlLimit = 5; // Adjustable crawl limit (raised from 1; lower for quick testing)
        web_crawler.crawl(crawlLimit);

        // Print url queue
@@ -98,6 +99,7 @@ public class crawler {
    private boolean is_username = false; // flag for provided (reddit) username
    private static final int MAXNGRAM = 3;
    private Consumer<String> outputCallback;
    private JProgressBar progressBar;

    public crawler(String file) {
        url_queue = new LinkedList<>();
@@ -111,8 +113,9 @@ public class crawler {
        // Estimate page count based on compressed file size
    }

    public crawler(String file, Consumer<String> outputCallback) {
    public crawler(String file, Consumer<String> outputCallback, JProgressBar progressBar) {
        this.outputCallback = outputCallback;
        this.progressBar = progressBar;
        url_queue = new LinkedList<>();
        visited_urls = new HashSet<>();

@@ -139,7 +142,7 @@ public class crawler {
            }
            try {
                System.out.println("Processing: " + cur_site);
                outputCallback.accept("\nProcessing: " + cur_site);
                outputCallback.accept("\n\nProcessing: " + cur_site);
                Document web_data = get_web_data(cur_site);
                if (web_data != null) {
                    processPage(web_data, isPost);
@@ -156,7 +159,9 @@ public class crawler {

    public void build(String language) {
        String corpus = "";
        if (language.equals("English")) {
        if (language.equals("SmallEnglish")) {
            corpus = "brownSmall.txt";
        } else if (language.equals("English")) {
            corpus = "brown.txt";
        } else if (language.equals("German")) {
            corpus = "germanSmall.txt";
@@ -226,9 +231,14 @@ public class crawler {
            is_username = false;
            Elements posts = web_data.select("shreddit-profile-comment[href]"); // Get posts from user profile
            // Add all posts in overview to url_queue
            for (Element link : posts) {
            for (Element link : posts) { // for links in the text
                String link_url = link.attr("href");
                if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                // Strip the URL of any anchor tags or query tags
                if (link_url.contains("#"))
                    link_url = link_url.split("#")[0];
                if (link_url.contains("?"))
                    link_url = link_url.split("\\?")[0]; // ? is a special character in regex
                if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                    foundLinks++;
                    url_queue.add(link_url);
                }
@@ -237,8 +247,13 @@ public class crawler {
            Elements links = web_data.select("a[href]");
            for (Element link : links) {
                String link_url = link.attr("abs:href");
                // Strip the URL of any anchor tags
                if (link_url.contains("#"))
                    link_url = link_url.split("#")[0];
                if (link_url.contains("?"))
                    link_url = link_url.split("\\?")[0]; // ? is a special character in regex
                // System.out.println("Found link: " + link_url);
                if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                    // System.out.println("Adding link to queue: " + link_url);
                    foundLinks++;
                    url_queue.add(link_url);
@@ -259,7 +274,7 @@ public class crawler {
            // Break the page text into manageable chunks, considering sentences
            List<String> chunks = splitTextIntoChunks(web_data.text());
            if (chunks.isEmpty()) {
                System.out.println("No text found on page.");
                // System.out.println("No text found on page.");
                outputCallback.accept("\nNo text found on page.");
                return;
            }
@@ -344,17 +359,12 @@ public class crawler {

        // Output sizes of both compressed and uncompressed data for reference
        System.out.println("Compressed metadata size: " + compressedData.length + " bytes");
        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes\n");
        System.out.println("Uncompressed metadata size: " + uncompressedData.length + " bytes");
        outputCallback.accept("\nUncompressed metadata size: " + uncompressedData.length + " bytes");
        outputCallback.accept("\nCompressed metadata size: " + compressedData.length + " bytes, Uncompressed metadata size: " + uncompressedData.length + " bytes");
        // Output rate of processing
        double processingRate = web_data.text().length() / ((endTime - startTime) / 1000000000.0);
        System.out.println("Rate of processing: " + Math.round(processingRate) + " bytes/second");
        outputCallback.accept("\nRate of processing: " + Math.round(processingRate) + " bytes/second");
        // Output # of links found in page
        System.out.println("# of additional links found: " + foundLinks + "\n");
        outputCallback.accept("\n# of additional links found: " + foundLinks + "\n");

        compression_size = compressedData.length;
    }