Commit c851dceb authored by Tejas Thakur Singh
Browse files

Merge branch '15-social-media' into 'master'

Social Media

Closes #15

See merge request ec504/ec504_projects/group7!9
parents a3e879ad 0c48e5ec
Loading
Loading
Loading
Loading
+66 −24
Original line number Diff line number Diff line
@@ -7,8 +7,7 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.*;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.io.BufferedReader;
@@ -17,9 +16,6 @@ import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;

import java.lang.Math;

@@ -39,9 +35,17 @@ public class crawler {
                web_crawler.debug = true;
            } else if (args[i].equals("--build")) {
                web_crawler.build_off_corpus = true;
            } else if (args[i].equals("--social")) {
                web_crawler.is_username = true;
                file_url = args[i+1]; // Reuse variable to hold username
            }
        } 
        if (!web_crawler.build_off_corpus) {
            if (web_crawler.is_username) {
                String user_url = "https://www.reddit.com/user/" + file_url + "/"; // Convert username into link to user page
                //System.out.println(file_url);
                web_crawler.add_to_queue(user_url);
            } else {
                try (FileReader f_read = new FileReader(file_url);
                     BufferedReader buf_read = new BufferedReader(f_read)) {
                    String url_line;
@@ -50,15 +54,23 @@ public class crawler {
                    }
                }
            }
        } 

        // Start crawling
        final int crawlLimit = 2; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        final int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        web_crawler.crawl(crawlLimit);

        //Print url queue
        //web_crawler.get_queue();

        // Print visited URLs
        web_crawler.get_visited();
    }

//    public void get_queue(){
//        for (String url : url_queue) {
//            System.out.println(url);
//        }
//    }
    //members
    private final LinkedList<String> url_queue;
    private final HashSet<String> visited_urls;
@@ -67,6 +79,7 @@ public class crawler {
    private int compression_size = 0;
    private boolean debug = false; // flag that outputs uncompressed json showing the trie
    private boolean build_off_corpus = false;
    private boolean is_username  = false; // flag for provided (reddit) username
    private static final int MAXNGRAM = 3;

    public crawler() {
@@ -87,10 +100,15 @@ public class crawler {
        int pageCount = 0;
        if (build_off_corpus) {
            System.out.println("Building off corpus...");
            processPage(get_file_text("brown.txt")); // TODO: extend to multiple files i.e. one per language
            processPage(get_file_text("brown.txt"), false); // TODO: extend to multiple files i.e. one per language
        }
        while (!url_queue.isEmpty() && pageCount < maxPages) {
            String cur_site = url_queue.poll();

            // A URL is treated as a reddit post if it contains this 25-character prefix (contains() matches it anywhere in the URL, not only at the start)
            String postCheck = "https://www.reddit.com/r/";
            boolean isPost = cur_site.contains(postCheck);

            if (cur_site == null || visited_urls.contains(cur_site)) {
                continue;
            }
@@ -98,7 +116,7 @@ public class crawler {
                System.out.println("Processing: " + cur_site);
                Document web_data = get_web_data(cur_site);
                if (web_data != null) {
                    processPage(web_data);
                    processPage(web_data, isPost);
                    visited_urls.add(cur_site);
                    pageCount++;
                }
@@ -144,7 +162,7 @@ public class crawler {
        }
    }

    private void processPage(Document web_data) {
    private void processPage(Document web_data, boolean isPost) {
        byte[] compressedData = new byte[0];
        byte[] uncompressedData = new byte[0];
        boolean sizeLimitExceeded = false;
@@ -153,6 +171,18 @@ public class crawler {
        long startTime = System.nanoTime(),
                endTime;

        if(is_username) { // If a reddit username was passed to the command line
            is_username = false;
            Elements posts = web_data.select("shreddit-profile-comment[href]"); // Get posts from user profile
            // Add all posts in overview to url_queue
            for (Element link: posts) {
                String link_url = link.attr("href");
                if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                    foundLinks++;
                    url_queue.add(link_url);
                }
            }
        } else {
            Elements links = web_data.select("a[href]");
            for (Element link : links) {
                String link_url = link.attr("abs:href");
@@ -163,6 +193,15 @@ public class crawler {
                    url_queue.add(link_url);
                }
            }
        }

        //Ignore for now - optimizes body + comment content retrieval
//        if(isPost) {
                // Get proper tags + queries
//            Element block = web_data.selectFirst("shreddit-post.block.xs:mt-xs.xs:-mx-xs.xs:px-xs.xs:rounded-[16px].pt-xs.nd:pt-xs.bg-[color:var(--shreddit-content-background)].box-border.mb-xs.nd:visible.nd:pb-2xl");
//            String body = block.select("div.text-neutral-content.text-body").text();
//            System.out.println(body);
//        }

        if(!build_off_corpus) {
            // Break the page text into manageable chunks, considering sentences
@@ -171,8 +210,11 @@ public class crawler {
                System.out.println("No text found on page.");
                return;
            }
            //System.out.println(chunks.size());

            for (String chunk : chunks) {
                //chunkCount++;
                //System.out.println(chunk);
                // System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size());
                previousUncompressedData = uncompressedData.clone();
                extractWordUsage(chunk, wordUsage);