Skip to content

Commit

Permalink
Merge pull request #6 from tboquet/tasknet-4-5/load-pipeline-improvem…
Browse files Browse the repository at this point in the history
…ents

tasknet-5-6/load-pipeline-improvements
  • Loading branch information
sileod authored Sep 19, 2023
2 parents 2d1c49e + 48d8e1a commit 35a5d36
Showing 1 changed file with 124 additions and 65 deletions.
189 changes: 124 additions & 65 deletions src/tasknet/utils.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,32 @@
from datasets import DatasetDict, Dataset, load_dataset
from easydict import EasyDict as edict
import copy
import functools
from tqdm.auto import tqdm
from datasets import concatenate_datasets

import funcy as fc
import torch
import magicattr
import torch
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from easydict import EasyDict as edict
from tqdm.auto import tqdm
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TextClassificationPipeline,
)


class NoTqdm:
    """Context manager that silences tqdm progress bars created inside it.

    On entry, ``tqdm.__init__`` is replaced by a variant whose ``disable``
    keyword defaults to ``True``. On exit, the constructor that was active
    at entry is put back, so repeated use does not pile wrappers on top of
    each other.
    """

    def __enter__(self):
        # Remember the current constructor so __exit__ can restore it exactly.
        self._saved_init = tqdm.__init__
        tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=True)

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Returning None lets any exception raised in the body propagate.
        tqdm.__init__ = self._saved_init


def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, seed=0):
train_testvalid = dataset.train_test_split(test_size=1 - train_ratio, seed=seed)
test_valid = train_testvalid["test"].train_test_split(test_size=val_test_ratio, seed=seed)
test_valid = train_testvalid["test"].train_test_split(
test_size=val_test_ratio, seed=seed
)
dataset = DatasetDict(
train=train_testvalid["train"],
validation=test_valid["test"],
Expand All @@ -25,32 +35,39 @@ def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, se
return dataset


def load_dataset_sample(*args, n=1000, seed=None):
    """Load a small shuffled sample of every split of a dataset.

    The dataset is opened in streaming mode, each split is shuffled and the
    first ``n`` examples are materialised into an in-memory ``Dataset``.

    Args:
        *args: Positional arguments forwarded to ``load_dataset``.
        n: Number of examples to take from each split. Defaults to 1000.
        seed: Optional seed forwarded to the streaming ``shuffle`` call so
            the sample can be reproduced. Defaults to None.

    Returns:
        DatasetDict: One sampled ``Dataset`` per split of the loaded dataset.
    """
    ds = load_dataset(*args, streaming=True)
    return DatasetDict(
        {
            split: Dataset.from_list(list(ds[split].shuffle(seed=seed).take(n)))
            for split in ds
        }
    )


def to_dict(x):
    """Convert ``x`` into an ``EasyDict``.

    Objects exposing an ``items`` attribute are handed to ``edict`` directly.
    For any other object, every attribute listed by ``dir(x)`` whose name
    does not start with a double underscore is copied into the result.

    Args:
        x: A mapping-like object or an arbitrary object with attributes.

    Returns:
        EasyDict: Attribute-accessible dictionary built from ``x``.
    """
    if hasattr(x, "items"):
        return edict(x)
    return edict({a: getattr(x, a) for a in dir(x) if not a.startswith("__")})


def deep_copy_cache(function):
    """Memoize ``function`` and hand out deep copies of cached results.

    The cache key is built from both positional and keyword arguments, so
    calls that differ only in keyword arguments are cached separately.
    All argument values must be hashable.

    The first call for a given key returns the freshly computed object
    itself (the same object the cache holds); every later call returns a
    ``copy.deepcopy`` of the cached object.

    Args:
        function: Callable to wrap.

    Returns:
        The wrapping callable, carrying the metadata of ``function``.
    """
    memo = {}

    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        # Sort keyword items so the key does not depend on call-site order.
        key = (args, tuple(sorted(kwargs.items())))
        if key in memo:
            return copy.deepcopy(memo[key])
        rv = function(*args, **kwargs)
        memo[key] = rv
        return rv

    return wrapper


def shallow_copy_A_to_B(A, B):
"""Shallow copy (=parameter sharing) A into B
https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427"""
https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427
"""

def rsetattr(obj, attr, val):
pre, _, post = attr.rpartition(".")
Expand All @@ -66,39 +83,43 @@ def _getattr(obj, attr):
rsetattr(B, nb, rgetattr(A, na))
return A, B


def normalize_label(label):
    """Map a raw label to a canonical lowercase, underscore-separated form.

    The label is stringified and lowercased, ``-`` and spaces become ``_``,
    and a few synonyms are unified: ``entailed`` -> ``entailment``,
    ``non_`` -> ``not_``, ``duplicate`` -> ``equivalent``,
    ``neg`` -> ``negative`` and ``pos`` -> ``positive``.

    Args:
        label: Any object; it is converted with ``str`` first.

    Returns:
        str: The normalized label.
    """
    label = str(label).lower()
    label = label.replace("-", "_")
    label = label.replace(" ", "_")
    label = label.replace("entailed", "entailment")
    label = label.replace("non_", "not_")
    label = label.replace("duplicate", "equivalent")
    for short, full in (("neg", "negative"), ("pos", "positive")):
        # Collapse the long form first so an already-expanded label such as
        # "negative" is not expanded a second time into "negativeative".
        label = label.replace(full, short).replace(short, full)
    return label


def merge_tasks(tasks, names):
    """Merge tasks whose name contains one of ``names`` into a single task.

    For each task, the first entry of ``names`` that is a substring of
    ``t.name`` becomes its merge key. When the train split has columns
    containing ``"choice"``, their count is appended to the key
    (``"<name>-<n_choices>"``). Tasks sharing a key have their datasets
    concatenated split by split; the last task seen for a key carries the
    merged dataset and is renamed to the key.

    Matching task objects are modified in place (``dataset`` and ``name``).

    Args:
        tasks: Sequence of task objects exposing ``name`` and ``dataset``.
        names: Candidate substrings used to group tasks.

    Returns:
        list: The tasks that matched nothing, in their original order,
        followed by one merged task per merge key.
    """
    prev, done, to_delete = {}, {}, set()
    for i, t in enumerate(tqdm(tasks)):
        matches = [name for name in names if name in t.name]
        if not matches:
            continue
        key = matches[0]
        columns = t.dataset["train"].features.keys()
        n_choices = len([c for c in columns if "choice" in c])
        if n_choices:
            key = f"{key}-{n_choices}"
        if key in prev:
            # Concatenate with everything merged so far under this key.
            t.dataset = DatasetDict(
                fc.merge_with(concatenate_datasets, prev[key], t.dataset)
            )
        prev[key] = t.dataset
        t.name = key
        done[key] = t
        to_delete.add(i)
    untouched = [task for i, task in enumerate(tasks) if i not in to_delete]
    return untouched + list(done.values())



def nested_children(m: torch.nn.Module):
children = dict(m.named_children())
output = {}
Expand All @@ -107,57 +128,95 @@ def nested_children(m: torch.nn.Module):
else:
for name, child in children.items():
if name.isnumeric():
name=f'[{name}]'
name = f"[{name}]"
try:
output[name] = nested_children(child)
except TypeError:
output[name] = nested_children(child)
return output


def convert(d):
    """Yield the dotted path of every leaf in a nested dictionary.

    Nested dictionaries are walked recursively and keys are joined with
    ``.``. A key that starts with ``[`` is attached without the dot, so
    ``{"a": {"[0]": {"b": x}}}`` yields ``"a[0].b"``.

    Args:
        d: Mapping whose values are either nested dicts or leaves.

    Yields:
        str: One path per non-dict value, in iteration order.
    """
    for k, v in d.items():
        if isinstance(v, dict):
            yield from (f"{k}.{x}".replace(".[", "[") for x in convert(v))
        else:
            yield k

def search_module(m, name, mode="attr", lowercase=True):
    """Find sub-module paths of ``m`` matching ``name``.

    Paths come from ``convert(nested_children(m))``.

    Args:
        m: Root module to search.
        name: Substring to look for.
        mode: ``"attr"`` matches against the attribute path itself;
            ``"class"`` matches against the class name of the object found
            at that path.
        lowercase: When true, both sides are lowercased before comparing.

    Returns:
        list: Matching attribute paths.

    Raises:
        ValueError: If ``mode`` is neither ``"attr"`` nor ``"class"``.
    """
    if mode not in ("attr", "class"):
        raise ValueError('mode must be "attr" or "class"')

    def process(text):
        return text.lower() if lowercase else text

    name = process(name)
    paths = convert(nested_children(m))
    if mode == "attr":
        return [path for path in paths if name in process(path)]
    return [
        path
        for path in paths
        if name in process(magicattr.get(m, path).__class__.__name__)
    ]


def load_pipeline(
    model_name: str,
    task_name: str,
    adapt_task_embedding: bool = True,
    multilingual: bool = False,
    device: int = -1,
    return_all_scores: bool = False,
) -> "TextClassificationPipeline":
    """Load a text classification pipeline for a model adapted to a task.

    The base model and tokenizer are loaded from ``model_name``; the adapter
    is loaded from ``model_name`` with ``"-nli"`` removed and ``"-adapters"``
    appended. The model is adapted to ``task_name`` and its ``id2label`` is
    taken from the ``labels`` feature of the task's train split.

    Args:
        model_name (str): Name of the model to be loaded.
        task_name (str): Name of the task for which the pipeline is loaded.
        adapt_task_embedding (bool, optional): If true, the adapter's task
            embedding is added in place to the CLS token row of the word
            embedding matrix. Defaults to True.
        multilingual (bool, optional): Passed to ``tasksource.load_task``.
            Forced to True when ``"mdeberta"`` appears in ``model_name``.
            Defaults to False.
        device (int, optional): The device to run the pipeline on (-1 for
            CPU, >= 0 for GPU ids). Defaults to -1.
        return_all_scores (bool, optional): Forwarded to
            ``TextClassificationPipeline``. Defaults to False.

    Returns:
        TextClassificationPipeline: Loaded text classification pipeline.

    Raises:
        ImportError: If the ``tasksource`` package is not installed.
    """
    if multilingual or "mdeberta" in model_name:
        multilingual = True

    from .models import Adapter

    try:
        import tasksource
    except ImportError as err:
        raise ImportError("Requires tasksource.\n pip install tasksource") from err
    task = tasksource.load_task(task_name, multilingual=multilingual)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, ignore_mismatched_sizes=True
    )
    adapter = Adapter.from_pretrained(model_name.replace("-nli", "") + "-adapters")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = adapter.adapt_model_to_task(model, task_name)
    model.config.id2label = task["train"].features["labels"]._int2str

    task_index = adapter.config.tasks.index(task_name)

    if adapt_task_embedding:
        # In-place update of a parameter, so gradient tracking is disabled.
        with torch.no_grad():
            model.deberta.embeddings.word_embeddings.weight[
                tokenizer.cls_token_id
            ] += adapter.Z[task_index]

    pipe = TextClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        device=device,
        return_all_scores=return_all_scores,
    )
    return pipe

0 comments on commit 35a5d36

Please sign in to comment.