[tune] Example for using huggingface hyperparameter_search API (#11158)

Kai Fricke 2020-10-02 00:00:57 +01:00 committed by GitHub
parent a866be381c
commit 681c24754a
4 changed files with 78 additions and 212 deletions


@@ -4,5 +4,4 @@ pbt_transformers_example
~~~~~~~~~~~~~~~~~~~~~~~~
.. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/pbt_transformers.py
.. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/trainer.py
.. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/utils.py


@@ -1,112 +1,25 @@
"""
This example uses the official
huggingface transformers `hyperparameter_search` API.
"""
import os
import ray
from ray.tune import CLIReporter
from ray.tune.integration.wandb import wandb_mixin # noqa: F401
from ray.tune.schedulers import PopulationBasedTraining
from ray import tune
from ray.tune.examples.pbt_transformers.utils import \
build_compute_metrics_fn, download_data
from ray.tune.examples.pbt_transformers import trainer
from transformers import (AutoConfig, AutoModelForSequenceClassification,
AutoTokenizer, GlueDataset, GlueDataTrainingArguments
as DataTrainingArguments, glue_tasks_num_labels,
Trainer, TrainingArguments)
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import download_data, \
build_compute_metrics_fn
from ray.tune.schedulers import PopulationBasedTraining
from transformers import glue_tasks_num_labels, AutoConfig, \
AutoModelForSequenceClassification, AutoTokenizer, Trainer, GlueDataset, \
GlueDataTrainingArguments, TrainingArguments
def get_trainer(model_name_or_path, train_dataset, eval_dataset, task_name,
training_args):
try:
num_labels = glue_tasks_num_labels[task_name]
except KeyError:
raise ValueError("Task not found: %s" % (task_name))
config = AutoConfig.from_pretrained(
model_name_or_path, num_labels=num_labels, finetuning_task=task_name)
model = AutoModelForSequenceClassification.from_pretrained(
model_name_or_path,
config=config,
)
tune_trainer = trainer.TuneTransformerTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=build_compute_metrics_fn(task_name))
return tune_trainer
def recover_checkpoint(tune_checkpoint_dir, model_name=None):
if tune_checkpoint_dir is None or len(tune_checkpoint_dir) == 0:
return model_name
# Get subdirectory used for Huggingface.
subdirs = [
os.path.join(tune_checkpoint_dir, name)
for name in os.listdir(tune_checkpoint_dir)
if os.path.isdir(os.path.join(tune_checkpoint_dir, name))
]
# There should only be 1 subdir.
assert len(subdirs) == 1, subdirs
return subdirs[0]
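# Expected layout: TuneTransformerTrainer.save_state() writes a single
# Huggingface-style subdirectory (e.g. "checkpoint-<global_step>") inside
# each Tune checkpoint directory, and that inner path is what gets handed
# back to the trainer to resume from.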
# __train_begin__
# Uncomment this line to use W&B!
# @wandb_mixin
def train_transformer(config, checkpoint_dir=None):
data_args = DataTrainingArguments(
task_name=config["task_name"], data_dir=config["data_dir"])
tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
train_dataset = GlueDataset(
data_args,
tokenizer=tokenizer,
mode="train",
cache_dir=config["data_dir"])
eval_dataset = GlueDataset(
data_args,
tokenizer=tokenizer,
mode="dev",
cache_dir=config["data_dir"])
eval_dataset = eval_dataset[:len(eval_dataset) // 2]
training_args = TrainingArguments(
output_dir=tune.get_trial_dir(),
learning_rate=config["learning_rate"],
do_train=True,
do_eval=True,
evaluate_during_training=True,
eval_steps=(len(train_dataset) // config["per_gpu_train_batch_size"]) +
1,
# We explicitly set save to 0, and do saving in evaluate instead
save_steps=0,
num_train_epochs=config["num_epochs"],
max_steps=config["max_steps"],
per_device_train_batch_size=config["per_gpu_train_batch_size"],
per_device_eval_batch_size=config["per_gpu_val_batch_size"],
warmup_steps=0,
weight_decay=config["weight_decay"],
logging_dir="./logs",
)
tune_trainer = get_trainer(
recover_checkpoint(checkpoint_dir, config["model_name"]),
train_dataset, eval_dataset, config["task_name"], training_args)
tune_trainer.train(
recover_checkpoint(checkpoint_dir, config["model_name"]))
# __train_end__
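# With the hyperparameter_search API used in tune_transformer below,
# transformers drives per-trial training itself, so a hand-written per-trial
# training function like train_transformer above is no longer needed.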
# __tune_begin__
def tune_transformer(num_samples=8,
gpus_per_trial=0,
smoke_test=False,
ray_address=None):
ray.init(ray_address, log_to_driver=False)
ray.init(ray_address, log_to_driver=True)
data_dir_name = "./data" if not smoke_test else "./test_data"
data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
if not os.path.exists(data_dir):
@@ -119,34 +32,73 @@ def tune_transformer(num_samples=8,
task_data_dir = os.path.join(data_dir, task_name.upper())
num_labels = glue_tasks_num_labels[task_name]
config = AutoConfig.from_pretrained(
model_name, num_labels=num_labels, finetuning_task=task_name)
# Download and cache tokenizer, model, and features
print("Downloading and caching Tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Triggers tokenizer download to cache
AutoTokenizer.from_pretrained(model_name)
print("Downloading and caching pre-trained model")
AutoModelForSequenceClassification.from_pretrained(
model_name,
config=config,
)
# Triggers model download to cache
AutoModelForSequenceClassification.from_pretrained(model_name)
def get_model():
return AutoModelForSequenceClassification.from_pretrained(
model_name,
config=config,
)
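# hyperparameter_search expects a model_init callable rather than a model
# instance: it is invoked to build a fresh, identically configured model for
# every trial, so weights are never shared between trials.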
# Download data.
download_data(task_name, data_dir)
config = {
"model_name": model_name,
"task_name": task_name,
"data_dir": task_data_dir,
"per_gpu_val_batch_size": 32,
"per_gpu_train_batch_size": tune.choice([16, 32, 64]),
"learning_rate": tune.uniform(1e-5, 5e-5),
"weight_decay": tune.uniform(0.0, 0.3),
"num_epochs": tune.choice([2, 3, 4, 5]),
data_args = GlueDataTrainingArguments(
task_name=task_name, data_dir=task_data_dir)
train_dataset = GlueDataset(
data_args, tokenizer=tokenizer, mode="train", cache_dir=task_data_dir)
eval_dataset = GlueDataset(
data_args, tokenizer=tokenizer, mode="dev", cache_dir=task_data_dir)
training_args = TrainingArguments(
output_dir=".",
learning_rate=1e-5, # config
do_train=True,
do_eval=True,
evaluate_during_training=True,
eval_steps=(len(train_dataset) // 16) + 1
if not smoke_test else 1, # config
save_steps=(len(train_dataset) // 16) + 1
if not smoke_test else 1, # config,
num_train_epochs=2, # config
max_steps=-1,
per_device_train_batch_size=16, # config
per_device_eval_batch_size=16, # config
warmup_steps=0,
weight_decay=0.1, # config
logging_dir="./logs",
)
trainer = Trainer(
model_init=get_model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=build_compute_metrics_fn(task_name))
tune_config = {
"per_device_eval_batch_size": 32,
"eval_steps": tune.sample_from(
lambda spec: len(train_dataset) // spec.config["per_device_train_batch_size"] + 1 # noqa: E501
) if not smoke_test else 1,
"save_steps": tune.sample_from(lambda spec: spec.config["eval_steps"]),
"num_train_epochs": tune.choice([2, 3, 4, 5]),
"max_steps": 1 if smoke_test else -1, # Used for smoke test.
"wandb": {
"project": "pbt_transformers",
"reinit": True,
"allow_val_change": True
}
}
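# Each entry in this hp_space that matches a TrainingArguments field is
# applied to the trial's training arguments; tune.sample_from lets eval_steps
# be derived from the trial's per_device_train_batch_size and lets save_steps
# mirror eval_steps. The "wandb" block is not a TrainingArguments field -- it
# is meant for Tune's Weights & Biases integration (see the wandb_mixin
# import above) and is otherwise ignored.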
scheduler = PopulationBasedTraining(
@@ -157,70 +109,36 @@ def tune_transformer(num_samples=8,
hyperparam_mutations={
"weight_decay": tune.uniform(0.0, 0.3),
"learning_rate": tune.uniform(1e-5, 5e-5),
"per_gpu_train_batch_size": [16, 32, 64],
"per_device_train_batch_size": [16, 32, 64],
})
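# PBT periodically clones the state of top-performing trials into
# under-performing ones and then perturbs or re-samples the hyperparameters
# listed in hyperparam_mutations for the cloned trial.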
reporter = CLIReporter(
parameter_columns={
"weight_decay": "w_decay",
"learning_rate": "lr",
"per_gpu_train_batch_size": "train_bs/gpu",
"per_device_train_batch_size": "train_bs/gpu",
"num_epochs": "num_epochs"
},
metric_columns=[
"eval_acc", "eval_loss", "epoch", "training_iteration"
])
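# The CLIReporter only affects console output: parameter_columns maps long
# hyperparameter names to short column headers, and metric_columns picks
# which reported metrics Tune prints while trials run.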
analysis = tune.run(
train_transformer,
trainer.hyperparameter_search(
hp_space=lambda _: tune_config,
backend="ray",
n_trials=num_samples,
resources_per_trial={
"cpu": 1,
"gpu": gpus_per_trial
},
config=config,
num_samples=num_samples,
scheduler=scheduler,
keep_checkpoints_num=3,
checkpoint_score_attr="training_iteration",
stop={"training_iteration": 1} if smoke_test else None,
progress_reporter=reporter,
local_dir="~/ray_results/",
name="tune_transformer_pbt")
if not smoke_test:
test_best_model(analysis, config["model_name"], config["task_name"],
config["data_dir"])
# __tune_end__
def test_best_model(analysis, model_name, task_name, data_dir):
data_args = DataTrainingArguments(task_name=task_name, data_dir=data_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name)
best_config = analysis.get_best_config(metric="eval_acc", mode="max")
print(best_config)
best_checkpoint = recover_checkpoint(
analysis.get_best_trial(metric="eval_acc",
mode="max").checkpoint.value)
print(best_checkpoint)
best_model = AutoModelForSequenceClassification.from_pretrained(
best_checkpoint).to("cuda")
test_args = TrainingArguments(output_dir="./best_model_results", )
test_dataset = GlueDataset(
data_args, tokenizer=tokenizer, mode="dev", cache_dir=data_dir)
test_dataset = test_dataset[len(test_dataset) // 2:]
test_trainer = Trainer(
best_model,
test_args,
compute_metrics=build_compute_metrics_fn(task_name))
metrics = test_trainer.evaluate(test_dataset)
print(metrics)
name="tune_transformer_pbt",
log_to_file=True)
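# hyperparameter_search returns a transformers.trainer_utils.BestRun; a
# sketch of inspecting it (capturing the return value in `best_run` is an
# illustration, not part of the call above):
#   best_run = trainer.hyperparameter_search(hp_space=lambda _: tune_config,
#                                            backend="ray", ...)
#   print(best_run.run_id, best_run.objective, best_run.hyperparameters)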
if __name__ == "__main__":


@@ -1,52 +0,0 @@
import logging
import os
from typing import Dict, Optional, Tuple
from ray import tune
import transformers
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
import torch
from torch.utils.data import Dataset
logger = logging.getLogger(__name__)
"""A Trainer class integrated with Tune.
The only changes to the original transformers.Trainer are:
- Report eval metrics to Tune
- Save state using Tune's checkpoint directories
"""
class TuneTransformerTrainer(transformers.Trainer):
def get_optimizers(
self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
self.current_optimizer, self.current_scheduler = super(
).get_optimizers(num_training_steps)
return (self.current_optimizer, self.current_scheduler)
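# Keeping references to the optimizer and scheduler here is what allows
# save_state() below to write optimizer.pt and scheduler.pt next to the
# model checkpoint.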
def evaluate(self,
eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:
eval_dataloader = self.get_eval_dataloader(eval_dataset)
output = self._prediction_loop(
eval_dataloader, description="Evaluation")
self._log(output.metrics)
self.save_state()
tune.report(**output.metrics)
return output.metrics
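# evaluate() is where this custom trainer hooks into Tune: metrics go to
# tune.report() and a checkpoint is written via save_state(). The official
# hyperparameter_search API now performs this reporting itself, which is why
# this module is deleted in this commit.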
def save_state(self):
with tune.checkpoint_dir(step=self.global_step) as checkpoint_dir:
self.args.output_dir = checkpoint_dir
# This is the directory name that Huggingface requires.
output_dir = os.path.join(
self.args.output_dir,
f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")
self.save_model(output_dir)
if self.is_world_master():
torch.save(self.current_optimizer.state_dict(),
os.path.join(output_dir, "optimizer.pt"))
torch.save(self.current_scheduler.state_dict(),
os.path.join(output_dir, "scheduler.pt"))


@@ -25,7 +25,8 @@ tensorflow-probability
timm
torch>=1.5.0
torchvision>=0.6.0
transformers
# transformers
git+git://github.com/huggingface/transformers.git@bdcc4b78a27775d1ec8f3fd297cb679c257289db#transformers
git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
wandb
xgboost