Commit 23ab3350 authored by Manuel Segimon
Browse files

Generate English corpus

parent 8f249dd4
Loading
Loading
Loading
Loading
+16 −16
Original line number Diff line number Diff line
@@ -5,25 +5,25 @@ def remove_numbers_from_text(text):
    cleaned_text = re.sub(r'^\d+\t', '', text)
    return cleaned_text

# # Assume you read the content of the file into a variable called `lines`
# # For example, you can read the file like this:
# with open('/Users/manuelsegimonplana/Desktop/ita_news_2023_1M-sentences.txt', 'r') as file:
#     lines = file.readlines()
# Assume you read the content of the file into a variable called `lines`
# For example, you can read the file like this:
with open('/Users/manuelsegimonplana/Desktop/eng_news_2023_1M-sentences.txt', 'r') as file:
    lines = file.readlines()

# # Now apply the function to each line
# cleaned_lines = [remove_numbers_from_text(line) for line in lines]
# Now apply the function to each line
cleaned_lines = [remove_numbers_from_text(line) for line in lines]

# # Optionally, you can write the cleaned lines back to a file
# with open('src/main/java/resources/italian.txt', 'a') as file:
#     file.writelines(cleaned_lines)
# Optionally, you can write the cleaned lines back to a file
with open('src/main/java/resources/english.txt', 'a') as file:
    file.writelines(cleaned_lines)

# Randomly remove 1.5M of the lines in the file (shuffle, then keep only the first 500000 lines)
import random
with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/italian.txt', 'r') as file:
    lines = file.readlines()
# import random
# with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/italian.txt', 'r') as file:
#     lines = file.readlines()

random.shuffle(lines)
# random.shuffle(lines)

# Optionally, you can write the cleaned lines back to a file
with open('italianSmall.txt', 'w') as file:
    file.writelines(lines[:500000]) # Last working number: 500000
# # Optionally, you can write the cleaned lines back to a file
# with open('italianSmall.txt', 'w') as file:
#     file.writelines(lines[:500000]) # Last working number: 500000