Loading generateCorpus.py +16 −16 Original line number Diff line number Diff line Loading @@ -5,25 +5,25 @@ def remove_numbers_from_text(text): cleaned_text = re.sub(r'^\d+\t', '', text) return cleaned_text # Assume you read the content of the file into a variable called `lines` # For example, you can read the file like this: with open('/Users/manuelsegimonplana/Desktop/eng_news_2023_1M-sentences.txt', 'r') as file: lines = file.readlines() # # Assume you read the content of the file into a variable called `lines` # # For example, you can read the file like this: # with open('/Users/manuelsegimonplana/Desktop/eng_news_2023_1M-sentences.txt', 'r') as file: # lines = file.readlines() # Now apply the function to each line cleaned_lines = [remove_numbers_from_text(line) for line in lines] # # Now apply the function to each line # cleaned_lines = [remove_numbers_from_text(line) for line in lines] # Optionally, you can write the cleaned lines back to a file with open('src/main/java/resources/english.txt', 'a') as file: file.writelines(cleaned_lines) # # Optionally, you can write the cleaned lines back to a file # with open('src/main/java/resources/english.txt', 'a') as file: # file.writelines(cleaned_lines) # Remove at random 1.5M of the lines in the file # import random # with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/italian.txt', 'r') as file: # lines = file.readlines() import random with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/english.txt', 'r') as file: lines = file.readlines() # random.shuffle(lines) random.shuffle(lines) # # Optionally, you can write the cleaned lines back to a file # with open('italianSmall.txt', 'w') as file: # file.writelines(lines[:500000]) # Last working number: 500000 # Optionally, you can write the cleaned lines back to a file with open('src/main/java/resources/brown.txt', 'a') as file: file.writelines(lines[:450000]) # Last working number: 500000 src/main/java/resources/brown.txt +0 −0 File changed.Preview suppressed by a .gitattributes entry or the file's encoding is unsupported. View original file View changed file Loading
generateCorpus.py +16 −16 Original line number Diff line number Diff line Loading @@ -5,25 +5,25 @@ def remove_numbers_from_text(text): cleaned_text = re.sub(r'^\d+\t', '', text) return cleaned_text # Assume you read the content of the file into a variable called `lines` # For example, you can read the file like this: with open('/Users/manuelsegimonplana/Desktop/eng_news_2023_1M-sentences.txt', 'r') as file: lines = file.readlines() # # Assume you read the content of the file into a variable called `lines` # # For example, you can read the file like this: # with open('/Users/manuelsegimonplana/Desktop/eng_news_2023_1M-sentences.txt', 'r') as file: # lines = file.readlines() # Now apply the function to each line cleaned_lines = [remove_numbers_from_text(line) for line in lines] # # Now apply the function to each line # cleaned_lines = [remove_numbers_from_text(line) for line in lines] # Optionally, you can write the cleaned lines back to a file with open('src/main/java/resources/english.txt', 'a') as file: file.writelines(cleaned_lines) # # Optionally, you can write the cleaned lines back to a file # with open('src/main/java/resources/english.txt', 'a') as file: # file.writelines(cleaned_lines) # Remove at random 1.5M of the lines in the file # import random # with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/italian.txt', 'r') as file: # lines = file.readlines() import random with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/english.txt', 'r') as file: lines = file.readlines() # random.shuffle(lines) random.shuffle(lines) # # Optionally, you can write the cleaned lines back to a file # with open('italianSmall.txt', 'w') as file: # file.writelines(lines[:500000]) # Last working number: 500000 # Optionally, you can write the cleaned lines back to a file with open('src/main/java/resources/brown.txt', 'a') as file: file.writelines(lines[:450000]) # Last working number: 500000
src/main/java/resources/brown.txt +0 −0 File changed.Preview suppressed by a .gitattributes entry or the file's encoding is unsupported. View original file View changed file