Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

How to store each document in its own txt file instead of a single txt file containing multiple documents #328

Open
hxy-62 opened this issue Mar 12, 2024 · 1 comment

Comments

@hxy-62
Copy link

hxy-62 commented Mar 12, 2024

By default, each output txt file contains multiple documents. I want each txt file to contain only one document instead. What should I do?

@sarda-devesh
Copy link

I wrote a simple script to do this:

import os
import shutil

# Name filter for input files: get_all_files() skips every file whose name
# does not match this string.
FILE_NAME_PREFIX = "wiki_"

# The path to the directory containing the extracted results.
# get_all_files() lists the entries of this directory and then lists the
# files inside each of those entries.
START_DIR = "/home/ubuntu/datasets/text"

# The directory to save the results per document (one "<doc id>.txt" each).
# WARNING: main() deletes this directory and recreates it on every run.
DOCS_SAVE_PATH = "/home/ubuntu/datasets/utf8_wikipedia_data"

def get_all_files(start_dir=None, file_name_prefix=None):
    """Collect the paths of all extracted files that should be split.

    The layout expected is ``start_dir/<sub-directory>/<file>``: every
    sub-directory of ``start_dir`` is listed and each regular file in it whose
    name starts with ``file_name_prefix`` is returned.

    Args:
        start_dir: Directory to scan. Defaults to the module-level
            ``START_DIR``.
        file_name_prefix: Only file names starting with this string are kept.
            Defaults to the module-level ``FILE_NAME_PREFIX``.

    Returns:
        A list of file paths, sorted by sub-directory and then by file name so
        that repeated runs process the files in the same order.
    """
    if start_dir is None:
        start_dir = START_DIR
    if file_name_prefix is None:
        file_name_prefix = FILE_NAME_PREFIX

    all_files = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for dir_name in sorted(os.listdir(start_dir)):
        dir_path = os.path.join(start_dir, dir_name)
        # Stray regular files next to the sub-directories (logs, READMEs, ...)
        # cannot be listed, so skip them instead of crashing.
        if not os.path.isdir(dir_path):
            continue

        for file_name in sorted(os.listdir(dir_path)):
            # Skip hidden files (editor swap files etc.) and anything that
            # does not carry the expected prefix.
            if file_name.startswith(".") or not file_name.startswith(file_name_prefix):
                continue

            # Record the file_path, ignoring nested directories that merely
            # share the prefix.
            file_path = os.path.join(dir_path, file_name)
            if os.path.isfile(file_path):
                all_files.append(file_path)

    return all_files

RECORD_END_MARKER = "</doc>"


def _parse_doc_id(header_line):
    """Return the value of the id="..." attribute of a ``<doc ...>`` line.

    Returns None when the line is not a ``<doc`` header or carries no
    non-empty id attribute.
    """
    if not header_line.startswith("<doc"):
        return None
    _, found, remainder = header_line.partition(' id="')
    if not found:
        return None
    doc_id, closing_quote, _ = remainder.partition('"')
    if not closing_quote or not doc_id:
        return None
    return doc_id


def _write_record(save_dir, doc_id, doc_lines):
    """Write one record to ``<save_dir>/<doc_id>.txt``; return its text length.

    The first collected line is dropped (presumably the document title, as in
    WikiExtractor output -- confirm for other inputs). Records that have no
    further lines are not written and yield 0.
    """
    if len(doc_lines) <= 1:
        return 0
    txt_to_write = "\n".join(doc_lines[1:])
    save_path = os.path.join(save_dir, doc_id + ".txt")
    with open(save_path, 'w', encoding='utf-8') as writer:
        writer.write(txt_to_write)
    return len(txt_to_write)


def extract_records_from_docs(doc_path, save_dir=None):
    """Split one multi-document file into one text file per document.

    The input is expected to consist of records of the form::

        <doc id="..." ...>
        first line (dropped)
        body lines
        </doc>

    Each record with at least one body line is written to
    ``<save_dir>/<id>.txt`` with blank lines removed and the remaining lines
    stripped and joined by a newline.

    Args:
        doc_path: Path of the file to split. It is read as UTF-8.
        save_dir: Existing directory that receives the per-document files.
            Defaults to the module-level ``DOCS_SAVE_PATH``.

    Returns:
        The length in characters of the longest text written for this file,
        or 0 if nothing was written.
    """
    if save_dir is None:
        save_dir = DOCS_SAVE_PATH

    max_txt_len = 0
    doc_id = None  # None while we are between records
    doc_lines = []

    # Stream the file line by line instead of loading it whole, and read it
    # as UTF-8 explicitly so the result does not depend on the locale.
    with open(doc_path, 'r', encoding='utf-8') as reader:
        for raw_line in reader:
            curr_line = raw_line.strip()

            if doc_id is None:
                # Outside a record: blank or unrecognised lines are skipped
                # until the next <doc ...> header shows up.
                doc_id = _parse_doc_id(curr_line)
                doc_lines = []
                continue

            if RECORD_END_MARKER in curr_line:
                written = _write_record(save_dir, doc_id, doc_lines)
                max_txt_len = max(max_txt_len, written)
                doc_id = None
                continue

            if curr_line:
                doc_lines.append(curr_line)

    # A truncated file may end before the closing marker; keep what was read
    # rather than failing on the missing "</doc>".
    if doc_id is not None:
        written = _write_record(save_dir, doc_id, doc_lines)
        max_txt_len = max(max_txt_len, written)

    return max_txt_len

def main():
    """Rebuild the output directory and split every input file into it.

    Any existing DOCS_SAVE_PATH directory is removed first, so each run starts
    from an empty output directory. For every file returned by
    get_all_files() the longest text written is printed, followed by the
    overall maximum across all files.
    """
    # Start from a clean output directory
    if os.path.exists(DOCS_SAVE_PATH):
        shutil.rmtree(DOCS_SAVE_PATH)
    os.makedirs(DOCS_SAVE_PATH, exist_ok=True)

    # Split each input file, tracking the longest text seen so far
    longest_overall = 0
    for input_path in get_all_files():
        longest_in_file = extract_records_from_docs(input_path)
        print("Got max len of ", longest_in_file, "for file", input_path)
        if longest_in_file > longest_overall:
            longest_overall = longest_in_file

    print("Got maximum txt length of", longest_overall)

# Run the conversion only when this file is executed as a script, so that
# importing it does not trigger main() (which deletes DOCS_SAVE_PATH).
if __name__ == "__main__":
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants