Commit 9f304279 authored by Moises Bensadon's avatar Moises Bensadon
Browse files

Merge branch '9-implement-perplexity-ngram-for-checker' into 'master'

Resolve "Implement perplexity + Ngram for Checker"

Closes #9

See merge request ec504/ec504_projects/group7!3
parents a6145abc 97efe9fb
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -3,3 +3,4 @@
target/
.vscode/
.idea/
*.json
 No newline at end of file

checker_test_file.txt

0 → 100644
+2 −0
Original line number Diff line number Diff line
This is unambiguous. This is word odd so choice. Sorry for the inconvenience.
Kenya, officially the Republic of Kenya
 No newline at end of file
+100 −22
Original line number Diff line number Diff line
https://en.wikipedia.org/wiki/Web_crawler
https://link.springer.com/book/10.1007/978-3-540-46332-0
http://www.youtube.com
http://www.facebook.com
http://www.yahoo.com
http://www.amazon.com
http://www.wikipedia.org
http://www.twitter.com
http://www.live.com
http://www.bing.com
http://www.instagram.com
http://www.linkedin.com
http://www.msn.com
http://www.vk.com
http://www.hao123.com
http://www.reddit.com
http://www.ebay.com
http://www.t.co
http://www.tmall.com
http://www.sohu.com
http://www.pinterest.com
http://www.netflix.com
http://www.microsoft.com
 No newline at end of file
https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Alliteration
https://en.wikipedia.org/wiki/Pun
https://en.wikipedia.org/wiki/Encyclop%C3%A6dia_Britannica
https://en.wikipedia.org/wiki/Cosmic_latte
https://en.wikipedia.org/wiki/Death_from_laughter
https://en.wikipedia.org/wiki/Bir_Tawil
https://en.wikipedia.org/wiki/M%C3%A4rket
https://en.wikipedia.org/wiki/Phineas_Gage
https://en.wikipedia.org/wiki/Snow_in_Florida
https://en.wikipedia.org/wiki/Cult_following
https://en.wikipedia.org/wiki/Ampelm%C3%A4nnchen
https://en.wikipedia.org/wiki/All_your_base_are_belong_to_us
https://en.wikipedia.org/wiki/Hoax
https://en.wikipedia.org/wiki/Sokal_affair
https://en.wikipedia.org/wiki/Mary_Toft
https://en.wikipedia.org/wiki/Breast-shaped_hill
https://en.wikipedia.org/wiki/Folly
https://en.wikipedia.org/wiki/Manchester
https://en.wikipedia.org/wiki/Eiffel_Tower_replicas_and_derivatives
https://en.wikipedia.org/wiki/Gravity_hill
https://en.wikipedia.org/wiki/List_of_cities_claimed_to_be_built_on_seven_hills
https://en.wikipedia.org/wiki/Seven_hills_of_Rome
https://en.wikipedia.org/wiki/List_of_micronations
https://en.wikipedia.org/wiki/List_of_tautological_place_names
https://en.wikipedia.org/wiki/Truism
https://en.wikipedia.org/wiki/Pizza_farm
https://en.wikipedia.org/wiki/Recursive_islands_and_lakes
https://en.wikipedia.org/wiki/Rocket_garden
https://en.wikipedia.org/wiki/Spite_house
https://en.wikipedia.org/wiki/Valeriepieris_circle
https://en.wikipedia.org/wiki/Lake_Nyos
https://en.wikipedia.org/wiki/Pe%C3%B1%C3%B3n_de_V%C3%A9lez_de_la_Gomera
https://en.wikipedia.org/wiki/Senegal
https://en.wikipedia.org/wiki/Akon_City
https://en.wikipedia.org/wiki/Akon
https://en.wikipedia.org/wiki/Cryptocurrency
https://en.wikipedia.org/wiki/Egypt
https://en.wikipedia.org/wiki/Sudan
https://en.wikipedia.org/wiki/Blue_Desert
https://en.wikipedia.org/wiki/Egypt%E2%80%93Israel_peace_treaty
https://en.wikipedia.org/wiki/United_Nations
https://en.wikipedia.org/wiki/Sinai_Desert
https://en.wikipedia.org/wiki/South_Africa
https://en.wikipedia.org/wiki/Boulders_Beach
https://en.wikipedia.org/wiki/Democratic_Republic_of_the_Congo
https://en.wikipedia.org/wiki/Congo_Pedicle
https://en.wikipedia.org/wiki/Ethiopia
https://en.wikipedia.org/wiki/Dallol_(hydrothermal_system)
https://en.wikipedia.org/wiki/Dallol_(ghost_town)
https://en.wikipedia.org/wiki/Gaet%27ale_Pond
https://en.wikipedia.org/wiki/Saint_Helena
https://en.wikipedia.org/wiki/Jacob%27s_Ladder_(Saint_Helena)
https://en.wikipedia.org/wiki/Nigeria
https://en.wikipedia.org/wiki/Kalakuta_Republic
https://en.wikipedia.org/wiki/Fela_Kuti
https://en.wikipedia.org/wiki/Supreme_Military_Council_of_Nigeria_(1966%E2%80%931979)
https://en.wikipedia.org/wiki/Cameroon
https://en.wikipedia.org/wiki/Lake_Nyos_disaster
https://en.wikipedia.org/wiki/Lake_Monoun
https://en.wikipedia.org/wiki/Mauritania
https://en.wikipedia.org/wiki/Mauritania_Railway
https://en.wikipedia.org/wiki/Nouadhibou
https://en.wikipedia.org/wiki/Iron_ore
https://en.wikipedia.org/wiki/Gabon
https://en.wikipedia.org/wiki/Oklo_Mine
https://en.wikipedia.org/wiki/Angola
https://en.wikipedia.org/wiki/Pal%C3%A1cio_de_Ferro
https://en.wikipedia.org/wiki/Luanda
https://en.wikipedia.org/wiki/Gustave_Eiffel
https://en.wikipedia.org/wiki/Eiffel_Tower
https://en.wikipedia.org/wiki/Spain
https://en.wikipedia.org/wiki/Morocco
https://en.wikipedia.org/wiki/Tombolo
https://en.wikipedia.org/wiki/2012_Pe%C3%B1%C3%B3n_de_V%C3%A9lez_de_la_Gomera_incident
https://en.wikipedia.org/wiki/Republic_of_Benin_(1967)
https://en.wikipedia.org/wiki/Yemen
https://en.wikipedia.org/wiki/Socotra
https://en.wikipedia.org/wiki/Dracaena_cinnabari
https://en.wikipedia.org/wiki/Dendrosicyos
https://en.wikipedia.org/wiki/Ghana
https://en.wikipedia.org/wiki/La_Tante_DC10_Restaurant
https://en.wikipedia.org/wiki/McDonnell_Douglas_DC-10
https://en.wikipedia.org/wiki/Accra
https://en.wikipedia.org/wiki/French_Southern_and_Antarctic_Lands
https://en.wikipedia.org/wiki/Tromelin_Island
https://en.wikipedia.org/wiki/The_Owl_House_(museum)
https://en.wikipedia.org/wiki/The_Owl_House
https://en.wikipedia.org/wiki/Kenya
https://en.wikipedia.org/wiki/Umoja,_Kenya
https://en.wikipedia.org/wiki/Violence_against_women
https://en.wikipedia.org/wiki/Samburu_people
https://en.wikipedia.org/wiki/Blood_Falls
https://en.wikipedia.org/wiki/Ross_Dependency
https://en.wikipedia.org/wiki/Iron_oxide
https://en.wikipedia.org/wiki/Heard_Island_and_McDonald_Islands
https://en.wikipedia.org/wiki/Mawson_Peak
https://en.wikipedia.org/wiki/Australia
https://en.wikipedia.org/wiki/McMurdo_Dry_Valleys
https://en.wikipedia.org/wiki/Don_Juan_Pond
 No newline at end of file

getLinks.py

0 → 100644
+23 −0
Original line number Diff line number Diff line
import requests
from bs4 import BeautifulSoup

def get_wikipedia_article_links(url):
    """Return the English-Wikipedia article links found on the page at ``url``.

    The page is downloaded, every ``<a href=...>`` is inspected, and hrefs
    that are site-relative article paths (they start with ``/wiki/`` and
    contain no ``:``) are turned into absolute ``https://en.wikipedia.org``
    URLs. Rejecting ``:`` skips namespaced pages such as ``/wiki/Help:...``.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of absolute article URLs, without duplicates, in the order in
        which they first appear on the page.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of scraping links out of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    seen = set()  # O(1) duplicate check; `links` keeps first-seen order
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Require the path to *start* with /wiki/: a substring test would also
        # accept protocol-relative hrefs like //de.wikipedia.org/wiki/Foo and
        # glue them onto the en.wikipedia.org host, yielding a broken URL.
        if not href.startswith('/wiki/') or ':' in href:
            continue
        full_link = f"https://en.wikipedia.org{href}"
        if full_link not in seen:
            seen.add(full_link)
            links.append(full_link)

    return links

# Example: fetch the article links found on the "Unusual articles" page.
unusual_articles_url = 'https://en.wikipedia.org/wiki/Wikipedia:Unusual_articles'
unusual_links = get_wikipedia_article_links(unusual_articles_url)

# Print at most this many links as a sample; slicing simply yields fewer when
# the page produced fewer links than the limit.
max_links_to_print = 100
for link in unusual_links[:max_links_to_print]:
    print(link)
+0 −66
Original line number Diff line number Diff line
package edu.bu.LanguageCorrection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * Heuristic anomaly scorer for sentences and short phrases.
 *
 * <p>Every score is an integer capped at 100. A score grows when the text's character length
 * deviates from the expected length and for each word that is not in a small built-in list of
 * common English words. Words are compared case-insensitively and are separated on whitespace
 * only, so a token with attached punctuation (for example {@code "the,"}) counts as uncommon.
 */
public class AnomalyDetector {

    private static final Set<String> COMMON_WORDS = new HashSet<>();
    private static final int AVERAGE_WORD_LENGTH = 5; // TODO: Change later

    /** Sentence length (in characters) that incurs no length penalty. */
    private static final int EXPECTED_SENTENCE_LENGTH = AVERAGE_WORD_LENGTH * 10;
    /** Phrases longer than this many characters incur the phrase length penalty. */
    private static final int MAX_PHRASE_LENGTH = AVERAGE_WORD_LENGTH * 3;
    private static final int PHRASE_LENGTH_PENALTY = 20;
    private static final int SENTENCE_UNCOMMON_WORD_PENALTY = 10;
    private static final int PHRASE_UNCOMMON_WORD_PENALTY = 5;
    private static final int MAX_SCORE = 100;

    static { // simple list of common words for demonstration purposes
        COMMON_WORDS.add("the");
        COMMON_WORDS.add("be");
        COMMON_WORDS.add("to");
        COMMON_WORDS.add("of");
        COMMON_WORDS.add("and"); // we can add more later
    }

    /**
     * Scores each sentence by length deviation plus word rarity.
     *
     * <p>The score is {@code |length - 50|} plus 10 for every uncommon word, capped at 100.
     * Identical sentences share a single map entry.
     *
     * @param sentences sentences to score
     * @return map from each sentence (unchanged, used as key) to its score in {@code [0, 100]}
     * @throws NullPointerException if {@code sentences} or one of its elements is {@code null}
     */
    public Map<String, Integer> analyzeSentences(List<String> sentences) {
        Map<String, Integer> sentenceScores = new HashMap<>();
        for (String sentence : sentences) {
            int lengthDeviation = Math.abs(sentence.length() - EXPECTED_SENTENCE_LENGTH);
            int rarityPenalty = countUncommonWords(sentence) * SENTENCE_UNCOMMON_WORD_PENALTY;
            sentenceScores.put(sentence, Math.min(lengthDeviation + rarityPenalty, MAX_SCORE));
        }
        return sentenceScores;
    }

    /**
     * Scores the phrases that {@code TextProcessor.extractPhrases(sentence, 2, 3)} returns for
     * each sentence.
     *
     * <p>A phrase gets 20 points when it is shorter than 5 or longer than 15 characters, plus 5
     * for every uncommon word, capped at 100. Identical phrases share a single map entry.
     * NOTE(review): the arguments 2 and 3 are presumably the minimum and maximum phrase size in
     * words; confirm against {@code TextProcessor}.
     *
     * @param sentences sentences from which phrases are extracted
     * @return map from each extracted phrase to its score in {@code [0, 100]}
     * @throws NullPointerException if {@code sentences} is {@code null}
     */
    public Map<String, Integer> analyzePhrases(List<String> sentences) {
        Map<String, Integer> phraseScores = new HashMap<>();

        for (String sentence : sentences) {
            List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3);

            for (String phrase : phrases) {
                int score = 0;
                if (phrase.length() < AVERAGE_WORD_LENGTH || phrase.length() > MAX_PHRASE_LENGTH) {
                    score += PHRASE_LENGTH_PENALTY;
                }
                score += countUncommonWords(phrase) * PHRASE_UNCOMMON_WORD_PENALTY;
                phraseScores.put(phrase, Math.min(score, MAX_SCORE));
            }
        }

        return phraseScores;
    }

    /**
     * Counts the whitespace-separated words of {@code text} that are not in the common-word list.
     *
     * <p>The text is trimmed first: {@code String.split} keeps a leading empty token when the
     * input starts with a delimiter (and returns one empty token for an empty input), and that
     * empty token would otherwise be counted as an uncommon word.
     */
    private static int countUncommonWords(String text) {
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return 0;
        }
        int uncommon = 0;
        for (String word : trimmed.split("\\s+")) {
            // Locale.ROOT keeps the lookup independent of the JVM's default locale.
            if (!COMMON_WORDS.contains(word.toLowerCase(Locale.ROOT))) {
                uncommon++;
            }
        }
        return uncommon;
    }
}
Loading