This repository has been archived by the owner on Nov 10, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_labels.py
69 lines (59 loc) · 2.32 KB
/
generate_labels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import argparse
import os.path as path
# Root directory of the dataset; the per-language image folder and label file
# used further down in this script are resolved relative to it.
BASE_DIR = "raw_data/trdg"

# Language codes accepted by the --lang option.
_SUPPORTED_LANGS = ["eng", "yor", "igbo"]

# Command-line interface: a single mandatory --lang option restricted to the
# supported language codes.
parser = argparse.ArgumentParser(description="Train a language OCR")
parser.add_argument(
    "--lang",
    choices=_SUPPORTED_LANGS,
    required=True,
    help="language to train in ",
)

# Parsed at module level; a missing or invalid --lang makes argparse print a
# usage message and exit.
args = parser.parse_args()
# Build the label file for the selected language: one line per entry of the
# image directory, formatted as "<image name> <label text>".
#
# The label text is recovered from the file name itself: the name is split on
# "_", the last piece (the part after the final underscore) is dropped, and
# the remaining pieces are joined with single spaces.
target_path = path.join(BASE_DIR, f"{args.lang}_target.txt")
image_dir = path.join(BASE_DIR, f"{args.lang}_image")

# The encoding is set explicitly: labels can contain non-ASCII characters, and
# the platform default encoding is not guaranteed to be able to represent them.
with open(target_path, mode="w", encoding="utf-8") as f:
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # generated label file reproducible between runs.
    for file in sorted(os.listdir(image_dir)):
        # NOTE: the name written to the label file has spaces replaced by
        # underscores, but the file on disk is NOT renamed here (the
        # os.rename() step that used to do this is disabled), so the names
        # only match files that contain no spaces or were already renamed.
        new_file = file.replace(" ", "_")
        print(new_file)

        label = " ".join(file.replace(".jpg", "").split("_")[:-1])
        f.write(f"{new_file} {label}\n")
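# NOTE: everything below is disabled one-off dataset-cleanup code, kept for reference:
#   1. print the sorted set of characters that occur in the target file;
#   2. drop every label containing a character from `wrong_char` and delete its image;
#   3. strip zero-width spaces (U+200B) from file names in raw_data/trdg/yor_image.
# with open(path.join(BASE_DIR, f"{args.lang}_target.txt"), mode="r") as f: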
# text = f.read()
# text = text.replace("\n", " ")
# print("".join(sorted(set(text))))
# wrong_char2 = " !\"#$%&'()*+,-.0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz©ÀÁÅÈÉÌÍÒÓÙÚàáèéëìíòóôõöùúüćėŃńōšǸǹʻˈː̣̀́ṢṣẸẹịỌọ–—‘’“”…←−▪►"
# wrong_char = ""
# with open(path.join(BASE_DIR, f"{args.lang}_target.txt"), mode="r") as f:
# text = f.read().split("\n")
# text_new = text.copy()
# for ccar in wrong_char:
# for label in text_new:
# if ccar in set(label):
# try:
# text.remove(label)
# os.remove(
# BASE_DIR + "/" + f"{args.lang}_image" + "/" + label.split()[0]
# )
# except:
# pass
# print(
# os.path.exists(
# BASE_DIR + "/" + f"{args.lang}_image" + "/" + label.split()[0]
# )
# )
# print(label)
# print(len(text_new))
# print(len(text))
# for file in os.listdir("raw_data/trdg/yor_image"):
# if "\u200b" in file:
# print(file)
# os.rename(
# os.path.join("raw_data/trdg/yor_image", file),
# os.path.join("raw_data/trdg/yor_image", file.replace("\u200b", "")),
# )