Skip to content

Commit

Permalink
fix BookCorpus downloading bug (PaddlePaddle#399)
Browse files Browse the repository at this point in the history
* fix BookCorpus downloading bug
  • Loading branch information
Steffy-zxf authored May 17, 2021
1 parent abeed31 commit c2d8584
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 4 deletions.
4 changes: 2 additions & 2 deletions examples/language_model/pretraining_data_prepare/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def do_text_sharding(model_name, formatted_files, output_dir, n_train_shards,
segmenter = ChineseSegmenter()

sharding_output_name_prefix = os.path.join(sharding_path, "sharding")
sharding = Sharding([formatted_files], sharding_output_name_prefix,
sharding = Sharding(formatted_files, sharding_output_name_prefix,
n_train_shards, n_test_shards, fraction_test_set)
sharding.load_articles()
logger.info("Splitting the articles into sentences.")
Expand Down Expand Up @@ -196,7 +196,7 @@ def create_data(do_lower_case, max_seq_length, max_predictions_per_seq,
logger.info("=" * 50)
logger.info("Skip text formatting, formatted file: %s" %
args.formatted_file)
formatted_files = args.formatted_file
formatted_files = [args.formatted_file]

sharding_output_name_prefix = do_text_sharding(
args.model_name, formatted_files, args.output_dir, args.n_train_shards,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@
class BookscorpusTextFormatter:
    """Dump the text records of the "bookcorpus" dataset into one text file.

    Constructing an instance loads the dataset via ``load_dataset`` and
    immediately writes ``book_formatted.txt`` under ``save_path``; the path
    of the written file is available afterwards as ``formatted_file``.
    """

    def __init__(self, save_path):
        """Load the dataset and write the formatted file.

        Args:
            save_path: Directory in which ``book_formatted.txt`` is created.
                The directory is not created here, so it presumably has to
                exist already -- TODO confirm against the caller.
        """
        # NOTE: the stale ``self.books_path = books_path`` assignment was
        # dropped -- ``books_path`` is not a parameter of this constructor,
        # so the assignment raised NameError before anything was written.
        self.bookcorpus_ds = load_dataset("bookcorpus")
        self.save_path = save_path
        self.formatted_file = os.path.join(self.save_path, "book_formatted.txt")
        self.merge()

    def merge(self):
        """Write every non-blank ``text`` record of the train split, one per line.

        Each record is stripped of surrounding whitespace and written followed
        by a single space and a newline. Records that are empty after
        stripping are skipped.
        """
        # newline='\n' disables platform newline translation on write.
        with open(self.formatted_file, mode='w', newline='\n') as ofile:
            for data in self.bookcorpus_ds['train']:
                text = data['text'].strip()
                if text:
                    # Write the record itself; the previous extra write of an
                    # undefined ``line`` variable raised NameError.
                    ofile.write(text + ' \n')

0 comments on commit c2d8584

Please sign in to comment.