Feature/jglue example #40

Merged 6 commits on Feb 11, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@
*egg-info
build
dist
.mypy_cache

*cpp

42 changes: 32 additions & 10 deletions README.md
@@ -1,4 +1,6 @@
# Bunruija
[![PyPI version](https://badge.fury.io/py/bunruija.svg)](https://badge.fury.io/py/bunruija)

Bunruija is a text classification toolkit.
Bunruija aims to enable pre-processing, training, and evaluation of text classification models with **minimum coding effort**.
Bunruija mainly focuses on Japanese, though it is also applicable to other languages.
@@ -20,9 +22,9 @@ Example of `sklearn.svm.SVC`

```yaml
data:
-  train: train.csv
-  dev: dev.csv
-  test: test.csv
+  train: train.jsonl
+  dev: dev.jsonl
+  test: test.jsonl

output_dir: models/svm-model

@@ -51,9 +53,9 @@ Example of BERT

```yaml
data:
-  train: train.csv
-  dev: dev.csv
-  test: test.csv
+  train: train.jsonl
+  dev: dev.jsonl
+  test: test.jsonl

output_dir: models/transformer-model

@@ -94,9 +96,9 @@ You can set data-related settings in `data`.

```yaml
data:
-  train: train.csv # training data
-  dev: dev.csv # development data
-  test: test.csv # test data
+  train: train.jsonl # training data
+  dev: dev.jsonl # development data
+  test: test.jsonl # test data
  label_column: label
  text_column: text
```
@@ -127,8 +129,28 @@ Format of `jsonl`:
```

### pipeline
You can set the pipeline of your model in the `pipeline` section.
It is a list of components that are used in your model.

For each component, `type` is a module path and `args` holds the arguments for that module.
For instance, if you set the first component as follows, [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) is instantiated with the given arguments and is applied to the data first in your model.

```yaml
- type: sklearn.feature_extraction.text.TfidfVectorizer
  args:
    tokenizer:
      type: bunruija.tokenizers.mecab_tokenizer.MeCabTokenizer
      args:
        lemmatize: true
        exclude_pos:
          - 助詞
          - 助動詞
    max_features: 10000
    min_df: 3
    ngram_range:
      - 1
      - 3
```
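For illustration, a minimal sketch of how such a `type`/`args` entry can be resolved into an object. The `instantiate` helper below is hypothetical, not bunruija's actual loader, and it ignores nested components such as `tokenizer`; a stdlib class stands in for `TfidfVectorizer` so the snippet is self-contained:

```python
import importlib


def instantiate(component: dict):
    """Build an object from a {"type": ..., "args": ...} config entry.

    Splits the dotted path into module and attribute, imports the module,
    and calls the resolved class with the given keyword arguments.
    """
    module_path, _, attr_name = component["type"].rpartition(".")
    cls = getattr(importlib.import_module(module_path), attr_name)
    return cls(**component.get("args", {}))


# A stdlib class stands in for sklearn's TfidfVectorizer here
obj = instantiate({"type": "collections.Counter", "args": {}})
print(type(obj).__name__)  # Counter
```

Chaining such components in config order is what makes the pipeline a list rather than a single entry.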

## Prediction using the trained classifier in Python code
After you have trained a classification model, you can use it for prediction as follows:
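The prediction snippet itself is truncated in this diff. As a general illustration of the train-then-predict flow that a config like the one above describes, here is a plain scikit-learn sketch (bunruija's own predictor API may differ; the toy texts and labels are made up):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Tiny illustrative dataset; real training data comes from the yaml config
texts = ["good movie", "bad movie", "great film", "awful film"]
labels = ["positive", "negative", "positive", "negative"]

# A pipeline analogous to the TfidfVectorizer + LinearSVC config above
model = Pipeline(
    [
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
        ("clf", LinearSVC(C=10.0)),
    ]
)
model.fit(texts, labels)

prediction = model.predict(["good film"])[0]
print(prediction)
```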
38 changes: 38 additions & 0 deletions example/jglue/jcola/README.md
@@ -0,0 +1,38 @@
# Evaluation Results

## Linear SVM
### Config
```yaml
pipeline:
  - type: sklearn.feature_extraction.text.TfidfVectorizer
    args:
      tokenizer:
        type: bunruija.tokenizers.mecab_tokenizer.MeCabTokenizer
        args:
          lemmatize: true
          exclude_pos:
            - 助詞
            - 助動詞
      max_features: 10000
      min_df: 3
      ngram_range:
        - 1
        - 3
  - type: sklearn.svm.LinearSVC
    args:
      verbose: 10
      C: 10.
```

### Results
```
F-score on dev: 0.7514450867052023
              precision    recall  f1-score   support

  acceptable       0.86      0.85      0.85       733
unacceptable       0.20      0.21      0.21       132

    accuracy                           0.75       865
   macro avg       0.53      0.53      0.53       865
weighted avg       0.76      0.75      0.75       865
```
36 changes: 36 additions & 0 deletions example/jglue/jcola/create_jcola_data.py
@@ -0,0 +1,36 @@
import json
from argparse import ArgumentParser
from pathlib import Path

from datasets import Dataset, load_dataset
from loguru import logger # type: ignore


def write_json(ds: Dataset, name: Path):
    with open(name, "w") as f:
        for sample in ds:
            category: str = ds.features["label"].names[sample["label"]]
            sample_ = {
                "text": sample["sentence"],
                "label": category,
            }
            print(json.dumps(sample_), file=f)
    logger.info(f"{name}")


def main():
    parser = ArgumentParser()
    parser.add_argument("--output_dir", default="example/jglue/jcola/data", type=Path)
    args = parser.parse_args()

    if not args.output_dir.exists():
        args.output_dir.mkdir(parents=True)

    dataset = load_dataset("shunk031/JGLUE", name="JCoLA")

    write_json(dataset["train"], args.output_dir / "train.jsonl")
    write_json(dataset["validation"], args.output_dir / "dev.jsonl")


if __name__ == "__main__":
    main()
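The jsonl that `write_json` emits holds one JSON object per line with `text` and `label` keys. A minimal round-trip sketch of that format (the Japanese sentences are made up for illustration):

```python
import json

# Two illustrative samples in the {"text", "label"} schema written above
samples = [
    {"text": "これは良い文です。", "label": "acceptable"},
    {"text": "文悪いこれは。", "label": "unacceptable"},
]

# Serialize: one JSON object per line (JSONL).
# ensure_ascii=False keeps Japanese readable; the script above
# uses json.dumps' default escaping instead.
lines = [json.dumps(sample, ensure_ascii=False) for sample in samples]
jsonl = "\n".join(lines)

# Deserialize line by line, as a jsonl reader would
labels = [json.loads(line)["label"] for line in jsonl.splitlines()]
print(labels)  # ['acceptable', 'unacceptable']
```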
38 changes: 38 additions & 0 deletions example/jglue/marc_ja/README.md
@@ -0,0 +1,38 @@
# Evaluation Results

## Linear SVM
### Config
```yaml
pipeline:
  - type: sklearn.feature_extraction.text.TfidfVectorizer
    args:
      tokenizer:
        type: bunruija.tokenizers.mecab_tokenizer.MeCabTokenizer
        args:
          lemmatize: true
          exclude_pos:
            - 助詞
            - 助動詞
      max_features: 10000
      min_df: 3
      ngram_range:
        - 1
        - 3
  - type: sklearn.svm.LinearSVC
    args:
      verbose: 10
      C: 10.
```

### Results
```
F-score on dev: 0.9225327201980899
              precision    recall  f1-score   support

    negative       0.56      0.85      0.68       542
    positive       0.98      0.93      0.96      5112

    accuracy                           0.92      5654
   macro avg       0.77      0.89      0.82      5654
weighted avg       0.94      0.92      0.93      5654
```
36 changes: 36 additions & 0 deletions example/jglue/marc_ja/create_marc_ja_data.py
@@ -0,0 +1,36 @@
import json
from argparse import ArgumentParser
from pathlib import Path

from datasets import Dataset, load_dataset
from loguru import logger # type: ignore


def write_json(ds: Dataset, name: Path):
    with open(name, "w") as f:
        for sample in ds:
            category: str = ds.features["label"].names[sample["label"]]
            sample_ = {
                "text": sample["sentence"],
                "label": category,
            }
            print(json.dumps(sample_), file=f)
    logger.info(f"{name}")


def main():
    parser = ArgumentParser()
    parser.add_argument("--output_dir", default="example/jglue/marc_ja/data", type=Path)
    args = parser.parse_args()

    if not args.output_dir.exists():
        args.output_dir.mkdir(parents=True)

    dataset = load_dataset("shunk031/JGLUE", name="MARC-ja")

    write_json(dataset["train"], args.output_dir / "train.jsonl")
    write_json(dataset["validation"], args.output_dir / "dev.jsonl")


if __name__ == "__main__":
    main()
25 changes: 25 additions & 0 deletions example/jglue/settings/svm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
data:
  train: data/train.jsonl
  dev: data/dev.jsonl

output_dir: models/svm-model

pipeline:
  - type: sklearn.feature_extraction.text.TfidfVectorizer
    args:
      tokenizer:
        type: bunruija.tokenizers.mecab_tokenizer.MeCabTokenizer
        args:
          lemmatize: true
          exclude_pos:
            - 助詞
            - 助動詞
      max_features: 10000
      min_df: 3
      ngram_range:
        - 1
        - 3
  - type: sklearn.svm.LinearSVC
    args:
      verbose: 10
      C: 10.