Commit 25931dac authored by Manuel  Segimon's avatar Manuel Segimon
Browse files

work in progress

parent 66ce634e
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
Sorry for the terrible inconvenience but this site is still under super development. be right back.
Sorry for Apple Dog Hello World the terrible so terrible inconvenience under super development.
 No newline at end of file
This is unambiguous. This is word odd so choice. Sorry for the inconvenience.
Kenya, officially the Republic of Kenya
 No newline at end of file
+100 −1
Original line number Diff line number Diff line
https://en.wikipedia.org/wiki/Web_crawler
https://en.wikipedia.org/wiki/Wikipedia:Vital_articles
 No newline at end of file
https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Alliteration
https://en.wikipedia.org/wiki/Pun
https://en.wikipedia.org/wiki/Encyclop%C3%A6dia_Britannica
https://en.wikipedia.org/wiki/Cosmic_latte
https://en.wikipedia.org/wiki/Death_from_laughter
https://en.wikipedia.org/wiki/Bir_Tawil
https://en.wikipedia.org/wiki/M%C3%A4rket
https://en.wikipedia.org/wiki/Phineas_Gage
https://en.wikipedia.org/wiki/Snow_in_Florida
https://en.wikipedia.org/wiki/Cult_following
https://en.wikipedia.org/wiki/Ampelm%C3%A4nnchen
https://en.wikipedia.org/wiki/All_your_base_are_belong_to_us
https://en.wikipedia.org/wiki/Hoax
https://en.wikipedia.org/wiki/Sokal_affair
https://en.wikipedia.org/wiki/Mary_Toft
https://en.wikipedia.org/wiki/Breast-shaped_hill
https://en.wikipedia.org/wiki/Folly
https://en.wikipedia.org/wiki/Manchester
https://en.wikipedia.org/wiki/Eiffel_Tower_replicas_and_derivatives
https://en.wikipedia.org/wiki/Gravity_hill
https://en.wikipedia.org/wiki/List_of_cities_claimed_to_be_built_on_seven_hills
https://en.wikipedia.org/wiki/Seven_hills_of_Rome
https://en.wikipedia.org/wiki/List_of_micronations
https://en.wikipedia.org/wiki/List_of_tautological_place_names
https://en.wikipedia.org/wiki/Truism
https://en.wikipedia.org/wiki/Pizza_farm
https://en.wikipedia.org/wiki/Recursive_islands_and_lakes
https://en.wikipedia.org/wiki/Rocket_garden
https://en.wikipedia.org/wiki/Spite_house
https://en.wikipedia.org/wiki/Valeriepieris_circle
https://en.wikipedia.org/wiki/Lake_Nyos
https://en.wikipedia.org/wiki/Pe%C3%B1%C3%B3n_de_V%C3%A9lez_de_la_Gomera
https://en.wikipedia.org/wiki/Senegal
https://en.wikipedia.org/wiki/Akon_City
https://en.wikipedia.org/wiki/Akon
https://en.wikipedia.org/wiki/Cryptocurrency
https://en.wikipedia.org/wiki/Egypt
https://en.wikipedia.org/wiki/Sudan
https://en.wikipedia.org/wiki/Blue_Desert
https://en.wikipedia.org/wiki/Egypt%E2%80%93Israel_peace_treaty
https://en.wikipedia.org/wiki/United_Nations
https://en.wikipedia.org/wiki/Sinai_Desert
https://en.wikipedia.org/wiki/South_Africa
https://en.wikipedia.org/wiki/Boulders_Beach
https://en.wikipedia.org/wiki/Democratic_Republic_of_the_Congo
https://en.wikipedia.org/wiki/Congo_Pedicle
https://en.wikipedia.org/wiki/Ethiopia
https://en.wikipedia.org/wiki/Dallol_(hydrothermal_system)
https://en.wikipedia.org/wiki/Dallol_(ghost_town)
https://en.wikipedia.org/wiki/Gaet%27ale_Pond
https://en.wikipedia.org/wiki/Saint_Helena
https://en.wikipedia.org/wiki/Jacob%27s_Ladder_(Saint_Helena)
https://en.wikipedia.org/wiki/Nigeria
https://en.wikipedia.org/wiki/Kalakuta_Republic
https://en.wikipedia.org/wiki/Fela_Kuti
https://en.wikipedia.org/wiki/Supreme_Military_Council_of_Nigeria_(1966%E2%80%931979)
https://en.wikipedia.org/wiki/Cameroon
https://en.wikipedia.org/wiki/Lake_Nyos_disaster
https://en.wikipedia.org/wiki/Lake_Monoun
https://en.wikipedia.org/wiki/Mauritania
https://en.wikipedia.org/wiki/Mauritania_Railway
https://en.wikipedia.org/wiki/Nouadhibou
https://en.wikipedia.org/wiki/Iron_ore
https://en.wikipedia.org/wiki/Gabon
https://en.wikipedia.org/wiki/Oklo_Mine
https://en.wikipedia.org/wiki/Angola
https://en.wikipedia.org/wiki/Pal%C3%A1cio_de_Ferro
https://en.wikipedia.org/wiki/Luanda
https://en.wikipedia.org/wiki/Gustave_Eiffel
https://en.wikipedia.org/wiki/Eiffel_Tower
https://en.wikipedia.org/wiki/Spain
https://en.wikipedia.org/wiki/Morocco
https://en.wikipedia.org/wiki/Tombolo
https://en.wikipedia.org/wiki/2012_Pe%C3%B1%C3%B3n_de_V%C3%A9lez_de_la_Gomera_incident
https://en.wikipedia.org/wiki/Republic_of_Benin_(1967)
https://en.wikipedia.org/wiki/Yemen
https://en.wikipedia.org/wiki/Socotra
https://en.wikipedia.org/wiki/Dracaena_cinnabari
https://en.wikipedia.org/wiki/Dendrosicyos
https://en.wikipedia.org/wiki/Ghana
https://en.wikipedia.org/wiki/La_Tante_DC10_Restaurant
https://en.wikipedia.org/wiki/McDonnell_Douglas_DC-10
https://en.wikipedia.org/wiki/Accra
https://en.wikipedia.org/wiki/French_Southern_and_Antarctic_Lands
https://en.wikipedia.org/wiki/Tromelin_Island
https://en.wikipedia.org/wiki/The_Owl_House_(museum)
https://en.wikipedia.org/wiki/The_Owl_House
https://en.wikipedia.org/wiki/Kenya
https://en.wikipedia.org/wiki/Umoja,_Kenya
https://en.wikipedia.org/wiki/Violence_against_women
https://en.wikipedia.org/wiki/Samburu_people
https://en.wikipedia.org/wiki/Blood_Falls
https://en.wikipedia.org/wiki/Ross_Dependency
https://en.wikipedia.org/wiki/Iron_oxide
https://en.wikipedia.org/wiki/Heard_Island_and_McDonald_Islands
https://en.wikipedia.org/wiki/Mawson_Peak
https://en.wikipedia.org/wiki/Australia
https://en.wikipedia.org/wiki/McMurdo_Dry_Valleys
https://en.wikipedia.org/wiki/Don_Juan_Pond
 No newline at end of file

getLinks.py

0 → 100644
+23 −0
Original line number Diff line number Diff line
import requests
from bs4 import BeautifulSoup

def get_wikipedia_article_links(url, timeout=10.0):
    """Return the unique English-Wikipedia article links found on a page.

    Downloads ``url``, parses the HTML and collects every anchor whose
    ``href`` is a site-relative article path (``/wiki/<title>``).  Any
    ``#fragment`` is dropped so that links to different sections of the
    same article are reported once.  Paths containing a colon are
    skipped, which filters out namespaced pages such as ``Wikipedia:``,
    ``File:`` or ``Special:`` (and, as a side effect, article titles that
    themselves contain a colon).

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute ``https://en.wikipedia.org/wiki/...`` URLs in
        order of first appearance, without duplicates.

    Raises:
        requests.RequestException: If the request fails, times out, or
            the server answers with an HTTP error status.
    """
    # Without a timeout, requests may block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of harvesting links from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    seen = set()  # O(1) duplicate check; `links` keeps first-seen order.
    for anchor in soup.find_all('a', href=True):
        # "/wiki/Foo#History" and "/wiki/Foo" are the same article.
        path = anchor['href'].partition('#')[0]
        # Require the path to *start* with /wiki/: a substring test would
        # also accept protocol-relative links to other hosts
        # ("//de.wikipedia.org/wiki/X") and produce a malformed URL below.
        if not path.startswith('/wiki/') or ':' in path:
            continue
        full_link = f"https://en.wikipedia.org{path}"
        if full_link not in seen:
            seen.add(full_link)
            links.append(full_link)

    return links

# Example: fetch the article links from the Unusual Articles page.
unusual_articles_url = 'https://en.wikipedia.org/wiki/Wikipedia:Unusual_articles'
unusual_links = get_wikipedia_article_links(unusual_articles_url)

# Print the first SAMPLE_SIZE links, one per line, as a sample.
SAMPLE_SIZE = 100
for link in unusual_links[:SAMPLE_SIZE]:
    print(link)
+6 −4
Original line number Diff line number Diff line
@@ -41,7 +41,7 @@ public class crawler {
        }

        // Start crawling
        int crawlLimit = 1; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        int crawlLimit = 100; // Adjustable limit (lower to 1 for ease of use)
        web_crawler.crawl(crawlLimit);
        
        // Print visited URLs
@@ -85,7 +85,9 @@ public class crawler {
                    Elements links = web_data.select("a[href]");
                    for (Element link : links) {
                        String link_url = link.attr("abs:href");
                        if (!link_url.isEmpty() && !visited_urls.contains(link_url)) {
                        System.out.println("Found link: " + link_url);
                        if (!link_url.isEmpty() && !visited_urls.contains(link_url.split("#")[0])) {
                            System.out.println("Adding link to queue: " + link_url);
                            url_queue.add(link_url);
                        }
                    }
@@ -122,13 +124,13 @@ public class crawler {

        // Break the page text into manageable chunks, considering sentences
        List<String> chunks = splitTextIntoChunks(web_data.text());
        int chunkCount = 0;
        // int chunkCount = 0;
        if (chunks.isEmpty()) {
            System.out.println("No text found on page.");
            return;
        }
        for (String chunk : chunks) {
            chunkCount++;
            // chunkCount++;
            // System.out.println("Current compressed size: "+ compressedData.length+". Processing chunk " + chunkCount + " of " + chunks.size());
            previousUncompressedData = uncompressedData.clone();
            extractWordUsage(chunk, wordUsage);