Commit 3fc60724 authored by Manuel  Segimon's avatar Manuel Segimon
Browse files

Merge branch '19-add-text-corpuses-for-other-languages' into 'master'

Resolve "Add text corpuses for other languages"

See merge request ec504/ec504_projects/group7!12
parents 8655c87d b090b380
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -4,3 +4,4 @@ target/
.vscode/
.idea/
*.json
.DS_Store
 No newline at end of file
+2 −0
Original line number Diff line number Diff line
@@ -193,6 +193,8 @@ Implemented GUI text highlighter for checker

Implemented GUI for correcter

Found and generated text corpora for German, Italian and Portuguese (from: https://wortschatz.uni-leipzig.de/en/download/German)

### Tejas Singh
Worked on base functionality of crawler: implemented Jsoup, basic data structures (such as the URL queue), and CLI (for use with files).

generateCorpus.py

0 → 100644
+29 −0
Original line number Diff line number Diff line
import re

def remove_numbers_from_text(text):
    """Return *text* without a leading "<digits><TAB>" prefix.

    Only a prefix at the very start of the string is dropped; digits
    followed by a tab anywhere else are left untouched. Text without such
    a prefix is returned unchanged.
    """
    leading_index = re.match(r'^\d+\t', text)
    if leading_index is None:
        return text
    # Keep everything after the matched "<digits><TAB>" prefix.
    return text[leading_index.end():]

# # Assume you read the content of the file into a variable called `lines`
# # For example, you can read the file like this:
# with open('/Users/manuelsegimonplana/Desktop/eng_news_2023_1M-sentences.txt', 'r') as file:
#     lines = file.readlines()

# # Now apply the function to each line
# cleaned_lines = [remove_numbers_from_text(line) for line in lines]

# # Optionally, you can write the cleaned lines back to a file
# with open('src/main/java/resources/english.txt', 'a') as file:
#     file.writelines(cleaned_lines)

# Draw a random sample of lines from the English corpus and append it to
# brown.txt. Both paths are relative to the repository root, so run this
# script from there.
import random

SOURCE_CORPUS = 'src/main/java/resources/english.txt'
TARGET_CORPUS = 'src/main/java/resources/brown.txt'
SAMPLE_SIZE = 450000  # Last working number: 500000

# Explicit encoding so non-ASCII sentences decode the same on every platform
# (assumes the corpus files are UTF-8 -- TODO confirm).
with open(SOURCE_CORPUS, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Shuffle in place, then keep the first SAMPLE_SIZE lines as the random sample.
random.shuffle(lines)

# Append mode: the sample is added after whatever brown.txt already contains,
# so running the script twice adds two samples.
with open(TARGET_CORPUS, 'a', encoding='utf-8') as file:
    file.writelines(lines[:SAMPLE_SIZE])
+51 −6
Original line number Diff line number Diff line
@@ -14,11 +14,39 @@
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <build>
        <plugins>
            <!-- Configuration for packaging the JAR with a Main-Class manifest entry -->
            <!-- addClasspath additionally writes a Class-Path manifest entry for the project's dependencies. -->
            <plugin>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.2.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <addClasspath>true</addClasspath>
                            <mainClass>edu.bu.LanguageCorrection.MainApp</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
            <!-- Plugin to execute the main class using Maven -->
            <!-- Keep mainClass here in sync with the maven-jar-plugin manifest entry above. -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>3.2.0</version>
                <configuration>
                    <mainClass>edu.bu.LanguageCorrection.MainApp</mainClass>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
            <scope>compile</scope>
        </dependency>
    
        <dependency>
@@ -28,4 +56,21 @@
        </dependency>
    </dependencies>

    <profiles>
        <!-- Opt-in profile (no activation rule is declared here); select it with -Plarge-memory. -->
        <profile>
            <id>large-memory</id>
            <build>
                <plugins>
                    <plugin>
                        <groupId>org.apache.maven.plugins</groupId>
                        <artifactId>maven-surefire-plugin</artifactId>
                        <version>3.0.0-M5</version>
                        <configuration>
                            <!-- Raises the maximum heap to 5 GB for the JVM that surefire forks to run tests. -->
                            <!-- NOTE(review): this is surefire-only; the exec-maven-plugin run presumably needs its own heap setting. Confirm. -->
                            <argLine>-Xmx5g</argLine>
                        </configuration>
                    </plugin>
                </plugins>
            </build>
        </profile>
    </profiles>
</project>
+1 −1
Original line number Diff line number Diff line
@@ -41,7 +41,7 @@ public class MainApp extends JFrame {

        // Check if metadata file exists
        if (!Files.exists(Paths.get("metadata.ser"))) {
            String[] languages = {"en", "es", "pt", "it"};
            String[] languages = {"en", "gr", "pt", "it"};
            String selectedLanguage = (String) JOptionPane.showInputDialog(this, "Metadata file not found. Please choose a language to build off of. \n(If you want to build from scratch just click Cancel)", "Language Selection", JOptionPane.PLAIN_MESSAGE, null, languages, languages[0]);
            if (selectedLanguage != null) {
                crawler webCrawler = new crawler();
Loading