Commit 8f249dd4 authored by Manuel  Segimon's avatar Manuel Segimon
Browse files

Shrink the corpuses

parent 5186af6c
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -193,7 +193,7 @@ Implemented GUI text highlighter for checker

Implemented GUI for correcter

Found and generated text corpses for german, italian and portuguese
Found and generated text corpora for German, Italian and Portuguese (from: https://wortschatz.uni-leipzig.de/en/download/German)

### Tejas Singh
Worked on base functionality of crawler: implemented Jsoup, basic data structures (such as the URL queue), and CLI (for use with files).
+18 −7
Original line number Diff line number Diff line
@@ -5,14 +5,25 @@ def remove_numbers_from_text(text):
    cleaned_text = re.sub(r'^\d+\t', '', text)
    return cleaned_text

# Assume you read the content of the file into a variable called `lines`
# For example, you can read the file like this:
with open('/Users/manuelsegimonplana/Desktop/ita_news_2023_1M-sentences.txt', 'r') as file:
# # Assume you read the content of the file into a variable called `lines`
# # For example, you can read the file like this:
# with open('/Users/manuelsegimonplana/Desktop/ita_news_2023_1M-sentences.txt', 'r') as file:
#     lines = file.readlines()

# # Now apply the function to each line
# cleaned_lines = [remove_numbers_from_text(line) for line in lines]

# # Optionally, you can write the cleaned lines back to a file
# with open('src/main/java/resources/italian.txt', 'a') as file:
#     file.writelines(cleaned_lines)

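# Randomly down-sample the corpus: shuffle all lines, then keep only the first
# 500,000 (the original note described this as removing ~1.5M lines at random).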
import random
with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/italian.txt', 'r') as file:
    lines = file.readlines()

# Now apply the function to each line
cleaned_lines = [remove_numbers_from_text(line) for line in lines]
random.shuffle(lines)

# Optionally, you can write the cleaned lines back to a file
with open('src/main/java/resources/italian.txt', 'a') as file:
    file.writelines(cleaned_lines)
with open('italianSmall.txt', 'w') as file:
    file.writelines(lines[:500000]) # Last working number: 500000
+35 −25
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@

    <build>
        <plugins>
            <!-- Configuration for packaging the JAR with a Main-Class manifest entry -->
            <plugin>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.2.0</version>
@@ -23,11 +24,20 @@
                    <archive>
                        <manifest>
                            <addClasspath>true</addClasspath>
                            <mainClass>edu.bu.LanguageCorrection.MainApp</mainClass>  <!-- Replace with your main class -->
                            <mainClass>edu.bu.LanguageCorrection.MainApp</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
            <!-- Plugin to execute the main class using Maven.
                 The mainClass configured below is the class that the plugin's
                 exec:java goal launches (e.g. `mvn compile exec:java`). -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>3.2.0</version>
                <configuration>
                    <mainClass>edu.bu.LanguageCorrection.MainApp</mainClass>
                </configuration>
            </plugin>
        </plugins>
    </build>

@@ -36,6 +46,7 @@
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
            <scope>compile</scope>
        </dependency>
    
        <dependency>
@@ -55,12 +66,11 @@
                        <artifactId>maven-surefire-plugin</artifactId>
                        <version>3.0.0-M5</version>
                        <configuration>
                        <argLine>-Xmx100g</argLine>
                            <argLine>-Xmx5g</argLine>
                        </configuration>
                    </plugin>
                </plugins>
            </build>
        </profile>
    </profiles>

</project>
+1 −1
Original line number Diff line number Diff line
@@ -41,7 +41,7 @@ public class MainApp extends JFrame {

        // Check if metadata file exists
        if (!Files.exists(Paths.get("metadata.ser"))) {
            String[] languages = {"en", "es", "pt", "it"};
            String[] languages = {"en", "gr", "pt", "it"};
            String selectedLanguage = (String) JOptionPane.showInputDialog(this, "Metadata file not found. Please choose a language to build off of. \n(If you want to build from scratch just click Cancel)", "Language Selection", JOptionPane.PLAIN_MESSAGE, null, languages, languages[0]);
            if (selectedLanguage != null) {
                crawler webCrawler = new crawler();
+3 −3
Original line number Diff line number Diff line
@@ -147,11 +147,11 @@ public class crawler {
        if (language.equals("en")) {
            corpus = "brown.txt";
        } else if (language.equals("gr")) {
            corpus = "german.txt";
            corpus = "germanSmall.txt";
        } else if (language.equals("it")) {
            corpus = "italian.txt";
            corpus = "italianSmall.txt";
        } else if (language.equals("pt")) {
            corpus = "portuguese.txt";
            corpus = "portugueseSmall.txt";
        } else {
            System.err.println("Unsupported language: " + language);
            return;
Loading