Loading generateCorpus.py +16 −16 Original line number Diff line number Diff line Loading @@ -5,25 +5,25 @@ def remove_numbers_from_text(text): cleaned_text = re.sub(r'^\d+\t', '', text) return cleaned_text # # Assume you read the content of the file into a variable called `lines` # # For example, you can read the file like this: # with open('/Users/manuelsegimonplana/Desktop/ita_news_2023_1M-sentences.txt', 'r') as file: # lines = file.readlines() # Assume you read the content of the file into a variable called `lines` # For example, you can read the file like this: with open('/Users/manuelsegimonplana/Desktop/eng_news_2023_1M-sentences.txt', 'r') as file: lines = file.readlines() # # Now apply the function to each line # cleaned_lines = [remove_numbers_from_text(line) for line in lines] # Now apply the function to each line cleaned_lines = [remove_numbers_from_text(line) for line in lines] # # Optionally, you can write the cleaned lines back to a file # with open('src/main/java/resources/italian.txt', 'a') as file: # file.writelines(cleaned_lines) # Optionally, you can write the cleaned lines back to a file with open('src/main/java/resources/english.txt', 'a') as file: file.writelines(cleaned_lines) # Remove at random 1.5M of the lines in the file import random with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/italian.txt', 'r') as file: lines = file.readlines() # import random # with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/italian.txt', 'r') as file: # lines = file.readlines() random.shuffle(lines) # random.shuffle(lines) # Optionally, you can write the cleaned lines back to a file with open('italianSmall.txt', 'w') as file: file.writelines(lines[:500000]) # Last working number: 500000 # # Optionally, you can write the cleaned lines back to a file # with open('italianSmall.txt', 'w') as file: # file.writelines(lines[:500000]) # Last working number: 500000 Loading
generateCorpus.py +16 −16 Original line number Diff line number Diff line Loading @@ -5,25 +5,25 @@ def remove_numbers_from_text(text): cleaned_text = re.sub(r'^\d+\t', '', text) return cleaned_text # # Assume you read the content of the file into a variable called `lines` # # For example, you can read the file like this: # with open('/Users/manuelsegimonplana/Desktop/ita_news_2023_1M-sentences.txt', 'r') as file: # lines = file.readlines() # Assume you read the content of the file into a variable called `lines` # For example, you can read the file like this: with open('/Users/manuelsegimonplana/Desktop/eng_news_2023_1M-sentences.txt', 'r') as file: lines = file.readlines() # # Now apply the function to each line # cleaned_lines = [remove_numbers_from_text(line) for line in lines] # Now apply the function to each line cleaned_lines = [remove_numbers_from_text(line) for line in lines] # # Optionally, you can write the cleaned lines back to a file # with open('src/main/java/resources/italian.txt', 'a') as file: # file.writelines(cleaned_lines) # Optionally, you can write the cleaned lines back to a file with open('src/main/java/resources/english.txt', 'a') as file: file.writelines(cleaned_lines) # Remove at random 1.5M of the lines in the file import random with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/italian.txt', 'r') as file: lines = file.readlines() # import random # with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/italian.txt', 'r') as file: # lines = file.readlines() random.shuffle(lines) # random.shuffle(lines) # Optionally, you can write the cleaned lines back to a file with open('italianSmall.txt', 'w') as file: file.writelines(lines[:500000]) # Last working number: 500000 # # Optionally, you can write the cleaned lines back to a file # with open('italianSmall.txt', 'w') as file: # file.writelines(lines[:500000]) # Last working number: 500000