Commit 3922f1ea authored by Alexander Ross Melnick
Browse files

Added social media crawling (Tumblr)

parent 6e067c21
Loading
Loading
Loading
Loading
+958 B (10.6 KiB)

File changed.

No diff preview for this file type.

+76 −29
Original line number Diff line number Diff line
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
@@ -26,6 +27,7 @@ public class ScratchCrawler {
    public static double processingRatePages = 0; // Processing rate in pages per second (inverse waitTime)
    public static double processingRateLinks = 0; // Processing rate in links per second
    public static double processingRateSize = 0; // Processing rate in bytes per second
    public static boolean printStats = false; // Print stats flag
    

    // Using a HashSet to store visited pages and pages to visit. This is the best data structure 
@@ -35,6 +37,11 @@ public class ScratchCrawler {
    public static Set<String> pagesVisited = new HashSet<String>(); // Set to store visited pages
    public static Set<String> pagesToVisit = new HashSet<String>(); // Set to store pages to visit
    public static Set<String> disallowedDomains = new HashSet<String>(); // Set to store disallowed domains
    public static Set<String> whitelist = new HashSet<String>(Arrays.asList( // Set to store whitelisted domains
        // Add whitelisted domains here   
        //"https://www.usenetarchives.com" // Our social media platform for crawling (approved by Prof. Trachtenberg)
    ));


    // In order to store the robots.txt restrictions, we are going to use a HashMap with the domain 
    // as the key and an object representing the restrictions as the value. This is the best data
@@ -62,7 +69,9 @@ public class ScratchCrawler {
        // Code to get the page

        // Provide real-time status and statistics feedback for the crawler
        if (printStats) {
            System.out.println("Processing URL: " + url); 
        }

        // REMOVE ME - fixing RegexParser.extractLinks() to handle URLs ending with ')'
        // if (url.endsWith(")")) {
@@ -138,17 +147,21 @@ public class ScratchCrawler {

            // Provide real-time status and statistics feedback for the crawler
            totalSize += pageContent.length();
            processingRatePages = 1000 / (double)waitTime; // Processing rate in pages per second (inverse waitTime)
            processingRateLinks = linksExtracted * processingRatePages; // Processing rate in links per second
            processingRateSize = pageContent.length() * processingRatePages; // Processing rate in bytes per second

            if (printStats) {
                System.out.println("Length of page processed [Bytes]: " + pageContent.length());
                System.out.println("Total size of pages visited [Bytes]: " + totalSize);
                System.out.println("Number of links extracted: " + linksExtracted);
                System.out.println("Number of pages crawled  (" + MAX_PAGES + " pages max): " + pagesVisited.size());
                System.out.println("URLs available to crawl: " +  pagesToVisit.size());
            processingRatePages = 1000 / (double)waitTime; // Processing rate in pages per second (inverse waitTime)
            processingRateLinks = linksExtracted * processingRatePages; // Processing rate in links per second
            processingRateSize = pageContent.length() * processingRatePages; // Processing rate in bytes per second
                System.out.println("Processing rate in pages per second: " + processingRatePages);
                System.out.println("Processing rate in links per second: " + processingRateLinks);
                System.out.println("Processing rate in bytes per second: " + processingRateSize);
            }
           

            writer.close(); // Close the writer
            reader.close(); // Close the reader
@@ -248,7 +261,7 @@ public class ScratchCrawler {
            System.out.println("Checking if URL is in visited robots.txt: " + url); // Print message

        // Extract the domain from the URL
        String domain = url; // Set the domain to the URL for now
        String domain; 
        Pattern pattern = Pattern.compile("((http://|https://)?[^:/]+)"); // Create a pattern to match the domain
        Matcher matcher = pattern.matcher(url); // Create a matcher for the pattern
        if (matcher.find()) {
@@ -280,6 +293,11 @@ public class ScratchCrawler {
        // Extract the domain from the URL
        String domain = extractDomain(url); // Delegate domain extraction to extractDomain()

        // Check if the domain is in the whitelist
        if (whitelist.contains(domain)) {
            return true;
        }
        
        // Check if the domain is in the visitedRobotsTXTs map
        if (visitedRobotsTXTs.containsKey(domain)) { // If the domain is in the map
            RobotsTXT robotsTXT = visitedRobotsTXTs.get(domain); // Get the RobotsTXT object for the domain
@@ -300,7 +318,7 @@ public class ScratchCrawler {

    public static String extractDomain(String url) {
        // Extract the domain from the URL
        String domain = url; // Set the domain to the URL for now
        String domain; 
        Pattern pattern = Pattern.compile("((http://|https://)?[^:/]+)"); // Create a pattern to match the domain
        Matcher matcher = pattern.matcher(url); // Create a matcher for the pattern
        if (matcher.find()) {
@@ -326,7 +344,7 @@ public class ScratchCrawler {
            } catch (InterruptedException e) {
                System.out.println("Error waiting between crawling pages.");
                e.printStackTrace();
            } // 
            } 
        }

        System.out.println("Crawling complete."); // Print message
@@ -365,25 +383,54 @@ public class ScratchCrawler {
        ScratchCrawler crawler = new ScratchCrawler(); // Create a new ScratchCrawler object
        //crawler.crawl("https://archive.org/details/bostonpubliclibrary"); // Start off the crawl with the seed page

        boolean startCrawl = false; // Flag to start the crawl
        String seed = ""; // Seed URL

        // Parse command-line arguments
        if ("--file".equals(args[0]) && args.length == 2) {
            String filePath = args[1];
        for (int i = 0; i < args.length; i++) {
            switch (args[i]) {
                case "--file":
                    if (i + 1 < args.length) {
                        String filePath = args[++i];
                        crawler.readURLsFromFile(filePath);
            crawler.crawl(); // Start off the crawl with the seed pages

        }
        else if("--seed".equals(args[0]) && args.length == 2) {
            String seed = args[1];
            crawler.crawl(seed); // Start off the crawl with the seed page
        }
        else if("--help".equals(args[0])) {
                        startCrawl = true;
                    } else {
                        System.out.println("Missing file path after --file");
                    }
                    break;
                case "--seed":
                    if (i + 1 < args.length) {
                        seed = args[++i];
                        crawler.pagesToVisit.add(seed);
                        startCrawl = true;
                    } else {
                        System.out.println("Missing seed URL after --seed");
                    }
                    break;
                case "--stats":
                    printStats = true;
                    break;
                case "--social": 
                    // Extension of our crawler to crawling social media posts of some large network (in this case Tumblr)
                    crawler.pagesToVisit.add("https://www.tumblr.com/");
                    startCrawl = true;
                    break;
                case "--help":
                    System.out.println("Usage: java ScratchCrawler [--file <file_path>] or [--seed <seed_url>] or [--help]");
                    System.out.println("--file <file_path>: Read URLs from a file and start crawling");
                    System.out.println("--seed <seed_url>: Start crawling from a seed URL");
                    System.out.println("--stats: Print statistics during crawling");
                    System.out.println("--social: Include crawling from the Usenet Archives website");
                    System.out.println("--help: Display this help message");
                    break;
                default:
                    System.out.println("Invalid argument: " + args[i] + ". Use --help for usage information.");
                    break;
            }
        else {
            System.out.println("Invalid arguments. Use --help for usage information.");
        }

        if (startCrawl) {
            crawler.crawl(); // Start the crawl
        }

    }
+13 −0
Original line number Diff line number Diff line
@@ -28,3 +28,16 @@
<!DOCTYPE html><html lang="en"><head><base href="/"><meta charset="utf-8"><link rel="icon" href="/offshoot_assets/favicon.ico"><link rel="preconnect" href="https://analytics.archive.org"><meta name="viewport" content="width=device-width,initial-scale=1"><meta name="google-site-verification" content="Q2YSouphkkgHkFNP7FgAkc4TmBs1Gmag3uGNndb53B8"><meta name="google-site-verification" content="bpjKvUvsX0lxfmjg19TLblckWkDpnptZEYsBntApxUk"><meta http-equiv="Pragma" content="no-cache"><meta http-equiv="cache-control" content="no-cache, no-store, must-revalidate"><link rel="stylesheet" href="/offshoot_assets/index.34c417fd1d63.css"><title>Internet Archive: Digital Library of Free &amp; Borrowable Books, Movies, Music &amp; Wayback Machine</title><script src="https://polyfill.archive.org/v3/polyfill.min.js?features=fetch%2CIntersectionObserver%2CResizeObserver%2CglobalThis%2CElement.prototype.getAttributeNames%2CString.prototype.startsWith%2CArray.prototype.flat%2CURL%2CURLSearchParams"></script><script src="/offshoot
<html><body><h1>analytics.archive.org</h1></body></html>
/* Polyfill service v3.109.0 * Disable minification (remove `.min` from URL path) for more info */(function(self, undefined) {function ArrayCreate(r){if(1/r==-Infinity&&(r=0),r>Math.pow(2,32)-1)throw new RangeError("Invalid array length");var n=[];return n.length=r,n}function Call(t,l){var n=arguments.length>2?arguments[2]:[];if(!1===IsCallable(t))throw new TypeError(Object.prototype.toString.call(t)+"is not a function.");return t.apply(l,n)}function Get(n,t){return n[t]}function HasOwnProperty(r,t){return Object.prototype.hasOwnProperty.call(r,t)}function HasProperty(n,r){return r in n}function IsArray(r){return"[object Array]"===Object.prototype.toString.call(r)}function IsCallable(n){return"function"==typeof n}function RequireObjectCoercible(e){if(null===e||e===undefined)throw TypeError(Object.prototype.toString.call(e)+" is not coercible to Object.");return e}function SameValueNonNumber(e,n){return e===n}function ToBoolean(o){return Boolean(o)}function ToObject(e){if(null===e||e===undefined)throw TypeErro
<!DOCTYPE html><html lang="en"><head><base href="/"><meta charset="utf-8"><link rel="icon" href="/offshoot_assets/favicon.ico"><link rel="preconnect" href="https://analytics.archive.org"><meta name="viewport" content="width=device-width,initial-scale=1"><meta name="google-site-verification" content="Q2YSouphkkgHkFNP7FgAkc4TmBs1Gmag3uGNndb53B8"><meta name="google-site-verification" content="bpjKvUvsX0lxfmjg19TLblckWkDpnptZEYsBntApxUk"><meta http-equiv="Pragma" content="no-cache"><meta http-equiv="cache-control" content="no-cache, no-store, must-revalidate"><link rel="stylesheet" href="/offshoot_assets/index.34c417fd1d63.css"><title>Internet Archive: Digital Library of Free &amp; Borrowable Books, Movies, Music &amp; Wayback Machine</title><script src="https://polyfill.archive.org/v3/polyfill.min.js?features=fetch%2CIntersectionObserver%2CResizeObserver%2CglobalThis%2CElement.prototype.getAttributeNames%2CString.prototype.startsWith%2CArray.prototype.flat%2CURL%2CURLSearchParams"></script><script src="/offshoot
<html><body><h1>analytics.archive.org</h1></body></html>
/* Polyfill service v3.109.0 * Disable minification (remove `.min` from URL path) for more info */(function(self, undefined) {function ArrayCreate(r){if(1/r==-Infinity&&(r=0),r>Math.pow(2,32)-1)throw new RangeError("Invalid array length");var n=[];return n.length=r,n}function Call(t,l){var n=arguments.length>2?arguments[2]:[];if(!1===IsCallable(t))throw new TypeError(Object.prototype.toString.call(t)+"is not a function.");return t.apply(l,n)}function Get(n,t){return n[t]}function HasOwnProperty(r,t){return Object.prototype.hasOwnProperty.call(r,t)}function HasProperty(n,r){return r in n}function IsArray(r){return"[object Array]"===Object.prototype.toString.call(r)}function IsCallable(n){return"function"==typeof n}function RequireObjectCoercible(e){if(null===e||e===undefined)throw TypeError(Object.prototype.toString.call(e)+" is not coercible to Object.");return e}function SameValueNonNumber(e,n){return e===n}function ToBoolean(o){return Boolean(o)}function ToObject(e){if(null===e||e===undefined)throw TypeErro
<!DOCTYPE html><html lang="en"><head><base href="/"><meta charset="utf-8"><link rel="icon" href="/offshoot_assets/favicon.ico"><link rel="preconnect" href="https://analytics.archive.org"><meta name="viewport" content="width=device-width,initial-scale=1"><meta name="google-site-verification" content="Q2YSouphkkgHkFNP7FgAkc4TmBs1Gmag3uGNndb53B8"><meta name="google-site-verification" content="bpjKvUvsX0lxfmjg19TLblckWkDpnptZEYsBntApxUk"><meta http-equiv="Pragma" content="no-cache"><meta http-equiv="cache-control" content="no-cache, no-store, must-revalidate"><link rel="stylesheet" href="/offshoot_assets/index.34c417fd1d63.css"><title>Internet Archive: Digital Library of Free &amp; Borrowable Books, Movies, Music &amp; Wayback Machine</title><script src="https://polyfill.archive.org/v3/polyfill.min.js?features=fetch%2CIntersectionObserver%2CResizeObserver%2CglobalThis%2CElement.prototype.getAttributeNames%2CString.prototype.startsWith%2CArray.prototype.flat%2CURL%2CURLSearchParams"></script><script src="/offshoot
<html><body><h1>analytics.archive.org</h1></body></html>
/* Polyfill service v3.109.0 * Disable minification (remove `.min` from URL path) for more info */(function(self, undefined) {function ArrayCreate(r){if(1/r==-Infinity&&(r=0),r>Math.pow(2,32)-1)throw new RangeError("Invalid array length");var n=[];return n.length=r,n}function Call(t,l){var n=arguments.length>2?arguments[2]:[];if(!1===IsCallable(t))throw new TypeError(Object.prototype.toString.call(t)+"is not a function.");return t.apply(l,n)}function Get(n,t){return n[t]}function HasOwnProperty(r,t){return Object.prototype.hasOwnProperty.call(r,t)}function HasProperty(n,r){return r in n}function IsArray(r){return"[object Array]"===Object.prototype.toString.call(r)}function IsCallable(n){return"function"==typeof n}function RequireObjectCoercible(e){if(null===e||e===undefined)throw TypeError(Object.prototype.toString.call(e)+" is not coercible to Object.");return e}function SameValueNonNumber(e,n){return e===n}function ToBoolean(o){return Boolean(o)}function ToObject(e){if(null===e||e===undefined)throw TypeErro
<!doctype html><html lang="en-US">  <head>    <meta data-rh="" charSet="utf-8"/><meta data-rh="" name="viewport" content="width=device-width, initial-scale=1"/><meta data-rh="" name="msapplication-TileColor" content="#001936"/><meta data-rh="" name="msapplication-TileImage" content="https://assets.tumblr.com/pop/manifest/mstile-150x150-b040e390.png"/><link data-rh="" rel="shortcut icon" href="https://assets.tumblr.com/pop/manifest/favicon-0e3d244a.ico" type="image/x-icon"/><link data-rh="" rel="icon" href="https://assets.tumblr.com/pop/manifest/favicon-cfddd25f.svg" type="image/svg+xml" sizes="any"/><link data-rh="" rel="mask-icon" href="https://assets.tumblr.com/pop/manifest/safari-pinned-tab-ad5440dd.svg" color="#001935"/><link data-rh="" rel="apple-touch-icon" sizes="180x180" href="https://assets.tumblr.com/pop/manifest/apple-touch-icon-6d2aadd9.png"/><title data-rh="">Trending topics on Tumblr</title><meta data-rh="" name="description" content="Explore trending topics on Tumblr. See all of the GIFs, fan a
<svg fill="none" height="128" viewBox="0 0 128 128" width="128" xmlns="http://www.w3.org/2000/svg"><g clip-rule="evenodd" fill-rule="evenodd"><path d="m64.0979 0c-35.4341 0-64.0979 28.6839-64.0979 63.9551 0 35.2631 28.6638 64.0449 64.0979 64.0449 35.2383 0 63.9021-28.7818 63.9021-64.0449 0-35.459-28.6638-63.9551-63.9021-63.9551z" fill="#001935"/><path d="m69.0573 78.3786c0 5.7711 2.9121 7.771 7.5453 7.771h6.5745v14.6684h-12.4476c-11.2159 0-19.5687-5.771-19.5687-19.5824v-22.1047h-10.1799v-11.9748c11.2077-2.9141 15.898-12.5624 16.4364-20.9292h11.64v18.9865h13.5815v13.9175h-13.5815z" fill="#fff"/></g></svg>
<!doctype html><html lang="en-US">  <head>    <meta data-rh="" charSet="utf-8"/><meta data-rh="" name="viewport" content="width=device-width, initial-scale=1"/><meta data-rh="" name="parsely-title" content="Trending topics on Tumblr"/><meta data-rh="" name="parsely-link" content="https://www.tumblr.com/explore/trending"/><meta data-rh="" name="parsely-type" content="index"/><meta data-rh="" name="parsely-image-url"/><meta data-rh="" name="parsely-section" content="Explore"/><meta data-rh="" name="parsely-tags"/><meta data-rh="" name="parsely-author"/><meta data-rh="" name="parsely-pub-date"/><meta data-rh="" name="msapplication-TileColor" content="#001936"/><meta data-rh="" name="msapplication-TileImage" content="https://assets.tumblr.com/pop/manifest/mstile-150x150-b040e390.png"/><link data-rh="" rel="shortcut icon" href="https://assets.tumblr.com/pop/manifest/favicon-0e3d244a.ico" type="image/x-icon"/><link data-rh="" rel="icon" href="https://assets.tumblr.com/pop/manifest/favicon-cfddd25f.svg" type="image/
<!DOCTYPE html><html lang="en"><head><base href="/"><meta charset="utf-8"><link rel="icon" href="/offshoot_assets/favicon.ico"><link rel="preconnect" href="https://analytics.archive.org"><meta name="viewport" content="width=device-width,initial-scale=1"><meta name="google-site-verification" content="Q2YSouphkkgHkFNP7FgAkc4TmBs1Gmag3uGNndb53B8"><meta name="google-site-verification" content="bpjKvUvsX0lxfmjg19TLblckWkDpnptZEYsBntApxUk"><meta http-equiv="Pragma" content="no-cache"><meta http-equiv="cache-control" content="no-cache, no-store, must-revalidate"><link rel="stylesheet" href="/offshoot_assets/index.34c417fd1d63.css"><title>Internet Archive: Digital Library of Free &amp; Borrowable Books, Movies, Music &amp; Wayback Machine</title><script src="https://polyfill.archive.org/v3/polyfill.min.js?features=fetch%2CIntersectionObserver%2CResizeObserver%2CglobalThis%2CElement.prototype.getAttributeNames%2CString.prototype.startsWith%2CArray.prototype.flat%2CURL%2CURLSearchParams"></script><script src="/offshoot
<html><body><h1>analytics.archive.org</h1></body></html>
/* Polyfill service v3.109.0 * Disable minification (remove `.min` from URL path) for more info */(function(self, undefined) {function ArrayCreate(r){if(1/r==-Infinity&&(r=0),r>Math.pow(2,32)-1)throw new RangeError("Invalid array length");var n=[];return n.length=r,n}function Call(t,l){var n=arguments.length>2?arguments[2]:[];if(!1===IsCallable(t))throw new TypeError(Object.prototype.toString.call(t)+"is not a function.");return t.apply(l,n)}function Get(n,t){return n[t]}function HasOwnProperty(r,t){return Object.prototype.hasOwnProperty.call(r,t)}function HasProperty(n,r){return r in n}function IsArray(r){return"[object Array]"===Object.prototype.toString.call(r)}function IsCallable(n){return"function"==typeof n}function RequireObjectCoercible(e){if(null===e||e===undefined)throw TypeError(Object.prototype.toString.call(e)+" is not coercible to Object.");return e}function SameValueNonNumber(e,n){return e===n}function ToBoolean(o){return Boolean(o)}function ToObject(e){if(null===e||e===undefined)throw TypeErro
<!DOCTYPE html><!--[if lt IE 7 ]> <html class="ie6 " lang="en"> <![endif]-->  <!--[if IE 7 ]> <html class="ie7 " lang="en"> <![endif]-->  <!--[if IE 8 ]> <html class="ie8 " lang="en"> <![endif]-->  <!--[if IE 9 ]> <html class="ie9 " lang="en"> <![endif]-->  <!--[if IE 10 ]> <html class="ie10 " lang="en"> <![endif]--> <!--[if !IE]><!--> <html class="notIE" lang="en"> <!--<![endif]-->  <head>        <title>Courses &raquo;  Academics  | Boston University</title>        <!--meta cluster-->    <meta charset="utf-8">    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" /><script type="text/javascript">(window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:[]}};(window.NREUM||(NREUM={})).loader_config={xpid:"UgUAV1RQGwQCVVNXAwY=",licenseKey:"b19c58809e",applicationID:"7212547"};;/*! For license information please see nr-loader-full-1.256.0.min.js.LICENSE.txt */(()=>{var e,t,r={234:(e,t,r)=>{"use strict";r.d(t,{P_:()=>m,Mt:()=>b,C5:()=>s,DL:()=>w,OP:()=>D,lF:()=>P,Yu:()=>E,Dg:()=>