Commit efb8656e authored by Tejas Thakur Singh's avatar Tejas Thakur Singh Committed by tejassng
Browse files

crawler can: access webpages + add all links from a webpage to url queue

parent cf35eb49
Loading
Loading
Loading
Loading
+49 −8
Original line number Diff line number Diff line
package org.example;

import org.jsoup.*;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.helper.Validate;


import java.util.HashSet;
import java.util.LinkedList;
import java.io.BufferedReader;
@@ -8,11 +15,11 @@ import java.io.FileReader;
import java.io.IOException;

import java.io.File;
public class crawler {

public class crawler {
    public static void main(String[] args) throws IOException{
        //Read all lines from a given file into the queue
        System.out.println(new File(".").getAbsolutePath());
        //System.out.println(new File(".").getAbsolutePath());

        FileReader f_read = new FileReader("crawler_test_file.txt");
        BufferedReader buf_read = new BufferedReader(f_read);
@@ -34,25 +41,59 @@ public class crawler {
    }
    /**
     * Processes one URL from {@code url_queue}: removes it from the queue, records it in
     * {@code visited_urls}, fetches it with {@code get_web_data}, and enqueues the
     * {@code a[href]} links found on the page that are not already in {@code visited_urls}.
     *
     * NOTE(review): this span comes from a rendered diff, so removed and added lines seem
     * to be interleaved (e.g. the while-loop with the recursive call next to the new
     * link-extraction code). The braces do not balance as shown here; confirm the real
     * method body against the checked-in file before relying on these comments.
     *
     * @param num_sites counter that is only incremented and passed to the recursive call
     *                  in the visible code; the limit check that used it is commented out
     */
    public void crawl(int num_sites) {
        //dequeue current website and add it to visited
       // if (num_sites < 2 && !url_queue.isEmpty()) {
            // remove() takes the head of the queue; LinkedList.remove() throws
            // NoSuchElementException when the queue is empty, and the guard above is disabled
            String cur_site = url_queue.remove();
        visited_urls.add(cur_site);

            //connect to webpage
            // get_web_data returns null when the fetch fails or the status is not 200
            Document web_data = get_web_data(cur_site);
        /*
            Planned work (not implemented in the visible code):
            - browse through URLs using jsoup
            - properly interpret robots.txt
            - compress + store text metadata
        */

        // NOTE(review): presumably the pre-commit loop that recursed until the queue drained;
        // its closing brace is not visible in this view -- confirm whether it still exists
        while ((!url_queue.isEmpty())) {
            crawl(++num_sites);
            // skip link extraction when the page could not be fetched
            if (web_data != null) {
                // every anchor element that carries an href attribute
                Elements links = web_data.select("a[href]");
                for (Element link : links) {
                    // the "abs:" prefix asks jsoup for the absolute form of the link URL
                    String link_url = link.attr("abs:href");
                    //System.out.println(link_url);
                    // only enqueue links that have not been recorded as visited yet;
                    // links already waiting in url_queue are not checked here
                    if (!visited_urls.contains(link_url)) {
                        url_queue.add(link_url);
                    }
                }
            }

            //crawl(++num_sites);
        //}
    }
    /**
     * Fetches the page at {@code url} and returns its parsed document.
     *
     * The request is sent with a desktop-browser user agent and a Google referrer.
     * When the response status is 200 and the body parses, the URL is added to
     * {@code visited_urls} and the parsed document is returned.
     *
     * @param url address of the page to download
     * @return the parsed page, or {@code null} when the URL is empty or malformed, the
     *         request or parsing fails, or the status code is anything other than 200
     */
    private Document get_web_data(String url) {
        try {
            //use execute() in order to receive a response object -> allows status code checking
            Connection.Response req_response = Jsoup.connect(url)
                            .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
                            .referrer("http://www.google.com")
                            .execute();

            //ensure that an OK is received before spending time parsing the body
            if (req_response.statusCode() != 200) {
                return null;
            }

            Document web_data = req_response.parse();
            visited_urls.add(url);
            return web_data;
        }
        catch (IOException | IllegalArgumentException err) {
            // IOException: network, HTTP status, or content-type failure.
            // IllegalArgumentException: jsoup rejects empty or malformed URLs, which happens
            // when an extracted link cannot be resolved to an absolute address.
            // A single bad page must not abort the crawl, so report it and move on.
            System.err.println("Skipping " + url + ": " + err.getMessage());
            return null;
        }
    }
    /**
     * Prints a heading line followed by every entry of {@code visited_urls},
     * one URL per line, to standard output.
     */
    public void get_visited() {
        System.out.println("All of the visited websites:");
        visited_urls.forEach(System.out::println);
    }
    //members
    private final LinkedList<String> url_queue;

crawler_test_file.txt

0 → 100644
+23 −0
Original line number Diff line number Diff line
https://en.wikipedia.org/wiki/Web_crawler
https://link.springer.com/book/10.1007/978-3-540-46332-0
http://www.youtube.com
http://www.facebook.com
http://www.yahoo.com
http://www.amazon.com
http://www.wikipedia.org
http://www.twitter.com
http://www.live.com
http://www.bing.com
http://www.instagram.com
http://www.linkedin.com
http://www.msn.com
http://www.vk.com
http://www.hao123.com
http://www.reddit.com
http://www.ebay.com
http://www.t.co
http://www.tmall.com
http://www.sohu.com
http://www.pinterest.com
http://www.netflix.com
http://www.microsoft.com
 No newline at end of file