Commit d932a294 authored by Manuel Segimon
Browse files

Shorten corpuses to avoid heap overflow

parent f45686c8
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
@@ -22,8 +22,12 @@ import random
# Load every line of the English corpus into memory.
# NOTE(review): this input path is absolute and specific to one machine, while
# the output path below is relative — confirm whether this should be relative too.
with open('/Users/manuelsegimonplana/Documents/Current Courses/Not Completed Homework/DS - Project/group7/src/main/java/resources/english.txt', 'r') as file:
    lines = file.readlines()

# Keep only lines with at least 4 whitespace-separated words
lines = [line for line in lines if len(line.split()) >= 4]

# Shuffle in place so the slice written below is a random sample
random.shuffle(lines)

# Append a capped sample of the cleaned lines to brown.txt (mode 'a' adds to
# whatever the file already contains). The cap is written exactly once: a
# second, larger slice would duplicate the first 10000 lines and undo the
# shortening. The corpus is kept short to avoid a heap overflow — presumably
# in the code that later loads this resource; confirm.
with open('src/main/java/resources/brown.txt', 'a') as file:
    file.writelines(lines[:10000])
+0 −0

File changed.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.

+0 −0

File changed.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.

+0 −0

File changed.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.

+0 −0

File changed.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.