Loading .gitignore +2 −1 Original line number Diff line number Diff line Loading @@ -4,3 +4,4 @@ target/ .vscode/ .idea/ *.json .DS_Store No newline at end of file README.md +2 −0 Original line number Diff line number Diff line Loading @@ -193,6 +193,8 @@ Implemented GUI text highlighter for checker Implemented GUI for correcter Found and generated text corpora for German, Italian and Portuguese ### Tejas Singh Worked on base functionality of crawler: implemented Jsoup, basic data structures (such as the URL queue), and CLI (for use with files). Loading generateCorpus.py 0 → 100644 +18 −0 Original line number Diff line number Diff line import re def remove_numbers_from_text(text): # Use regular expression to remove numbers followed by a tab cleaned_text = re.sub(r'^\d+\t', '', text) return cleaned_text # Assume you read the content of the file into a variable called `lines` # For example, you can read the file like this: with open('/Users/manuelsegimonplana/Desktop/ita_news_2023_1M-sentences.txt', 'r') as file: lines = file.readlines() # Now apply the function to each line cleaned_lines = [remove_numbers_from_text(line) for line in lines] # Optionally, you can write the cleaned lines back to a file with open('src/main/java/resources/italian.txt', 'a') as file: file.writelines(cleaned_lines) Loading
.gitignore +2 −1 Original line number Diff line number Diff line Loading @@ -4,3 +4,4 @@ target/ .vscode/ .idea/ *.json .DS_Store No newline at end of file
README.md +2 −0 Original line number Diff line number Diff line Loading @@ -193,6 +193,8 @@ Implemented GUI text highlighter for checker Implemented GUI for correcter Found and generated text corpora for German, Italian and Portuguese ### Tejas Singh Worked on base functionality of crawler: implemented Jsoup, basic data structures (such as the URL queue), and CLI (for use with files). Loading
# generateCorpus.py
"""Strip the leading "<digits><TAB>" column from a sentence file.

Every line of the input is passed through ``remove_numbers_from_text`` and
the result is appended to the target corpus file.

Usage::

    python generateCorpus.py [SOURCE [TARGET]]

Both arguments are optional; when omitted, the paths the script was
originally written for are used.
"""
import argparse
import re

# Matches a run of digits immediately followed by a tab, but only at the very
# start of the string (no MULTILINE flag). Compiled once because the pattern
# is applied to every line of a large file.
_NUMBER_PREFIX = re.compile(r'^\d+\t')

# Defaults preserve the original hard-coded locations so running the script
# with no arguments behaves as before.
DEFAULT_SOURCE = '/Users/manuelsegimonplana/Desktop/ita_news_2023_1M-sentences.txt'
DEFAULT_TARGET = 'src/main/java/resources/italian.txt'


def remove_numbers_from_text(text):
    """Return *text* without a leading ``<digits><TAB>`` prefix.

    Only a prefix at the very beginning of the string is removed; digits
    that are not followed by a tab, or that appear later in the string, are
    left untouched. Text without such a prefix is returned unchanged.

    Args:
        text: A single line (or any string) to clean.

    Returns:
        The string with at most one leading number-and-tab prefix removed.
    """
    return _NUMBER_PREFIX.sub('', text)


def main(argv=None):
    """Clean SOURCE line by line and append the result to TARGET.

    Args:
        argv: Command-line arguments excluding the program name. ``None``
            makes argparse read ``sys.argv[1:]``.

    Raises:
        OSError: If SOURCE cannot be read or TARGET cannot be opened.
    """
    parser = argparse.ArgumentParser(
        description='Remove the leading number column from a sentence file.'
    )
    parser.add_argument(
        'source', nargs='?', default=DEFAULT_SOURCE,
        help='numbered sentence file to read',
    )
    parser.add_argument(
        'target', nargs='?', default=DEFAULT_TARGET,
        help='corpus file to append the cleaned lines to',
    )
    args = parser.parse_args(argv)

    # The encoding is explicit so accented characters do not depend on the
    # platform's default locale.
    # NOTE(review): assumes the source dump is UTF-8 -- confirm.
    #
    # The source is opened first, so a missing input fails before the target
    # is created. The target stays in append mode as in the original script,
    # which means re-running the script adds the same lines again.
    with open(args.source, 'r', encoding='utf-8') as src, \
            open(args.target, 'a', encoding='utf-8') as dst:
        # Stream line by line instead of holding the whole file in memory.
        dst.writelines(remove_numbers_from_text(line) for line in src)


if __name__ == '__main__':
    main()