mirror of https://github.com/vale981/ray (synced 2025-03-06 10:31:39 -05:00)
[tune] Example for using huggingface hyperparameter_search API (#11158)
parent a866be381c
commit 681c24754a
4 changed files with 78 additions and 212 deletions
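The pattern this commit switches the example to is Hugging Face's `Trainer.hyperparameter_search` with `backend="ray"`, which hands the trial loop over to Ray Tune. Below is a minimal sketch of the call shape only; `model_name`, `train_dataset`, and `eval_dataset` are illustrative placeholders, and the full working version is in the diff that follows.

# Sketch only: the datasets and model name are placeholders, not runnable as-is.
from ray import tune
from transformers import (AutoModelForSequenceClassification, Trainer,
                          TrainingArguments)

model_name = "bert-base-uncased"  # placeholder


def get_model():
    # `model_init` (rather than `model=`) lets every trial start from fresh weights.
    return AutoModelForSequenceClassification.from_pretrained(model_name)


trainer = Trainer(
    model_init=get_model,
    args=TrainingArguments(output_dir="."),
    train_dataset=train_dataset,  # placeholder dataset
    eval_dataset=eval_dataset,    # placeholder dataset
)

best_run = trainer.hyperparameter_search(
    hp_space=lambda _: {"learning_rate": tune.uniform(1e-5, 5e-5)},
    backend="ray",   # run the search with Ray Tune
    n_trials=8,      # number of Tune trials
)
print(best_run.hyperparameters)

Extra keyword arguments to `hyperparameter_search` (scheduler, progress reporter, checkpointing options) are forwarded to `tune.run`, which is how the example below plugs in Population Based Training.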
@@ -4,5 +4,4 @@ pbt_transformers_example
~~~~~~~~~~~~~~~~~~~~~~~~

.. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/pbt_transformers.py
.. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/trainer.py
.. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/utils.py
@@ -1,112 +1,25 @@
"""
This example uses the official
huggingface transformers `hyperparameter_search` API.
"""
import os

import ray
from ray.tune import CLIReporter
from ray.tune.integration.wandb import wandb_mixin  # noqa: F401
from ray.tune.schedulers import PopulationBasedTraining

from ray import tune
from ray.tune.examples.pbt_transformers.utils import \
    build_compute_metrics_fn, download_data
from ray.tune.examples.pbt_transformers import trainer

from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, GlueDataset, GlueDataTrainingArguments
                          as DataTrainingArguments, glue_tasks_num_labels,
                          Trainer, TrainingArguments)
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import download_data, \
    build_compute_metrics_fn
from ray.tune.schedulers import PopulationBasedTraining
from transformers import glue_tasks_num_labels, AutoConfig, \
    AutoModelForSequenceClassification, AutoTokenizer, Trainer, GlueDataset, \
    GlueDataTrainingArguments, TrainingArguments


def get_trainer(model_name_or_path, train_dataset, eval_dataset, task_name,
                training_args):
    try:
        num_labels = glue_tasks_num_labels[task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (task_name))

    config = AutoConfig.from_pretrained(
        model_name_or_path, num_labels=num_labels, finetuning_task=task_name)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        config=config,
    )
    tune_trainer = trainer.TuneTransformerTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(task_name))

    return tune_trainer


def recover_checkpoint(tune_checkpoint_dir, model_name=None):
    if tune_checkpoint_dir is None or len(tune_checkpoint_dir) == 0:
        return model_name
    # Get subdirectory used for Huggingface.
    subdirs = [
        os.path.join(tune_checkpoint_dir, name)
        for name in os.listdir(tune_checkpoint_dir)
        if os.path.isdir(os.path.join(tune_checkpoint_dir, name))
    ]
    # There should only be 1 subdir.
    assert len(subdirs) == 1, subdirs
    return subdirs[0]


# __train_begin__
# Uncomment this line to use W&B!
# @wandb_mixin
def train_transformer(config, checkpoint_dir=None):
    data_args = DataTrainingArguments(
        task_name=config["task_name"], data_dir=config["data_dir"])
    tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
    train_dataset = GlueDataset(
        data_args,
        tokenizer=tokenizer,
        mode="train",
        cache_dir=config["data_dir"])
    eval_dataset = GlueDataset(
        data_args,
        tokenizer=tokenizer,
        mode="dev",
        cache_dir=config["data_dir"])
    eval_dataset = eval_dataset[:len(eval_dataset) // 2]
    training_args = TrainingArguments(
        output_dir=tune.get_trial_dir(),
        learning_rate=config["learning_rate"],
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,
        eval_steps=(len(train_dataset) // config["per_gpu_train_batch_size"]) +
        1,
        # We explicitly set save to 0, and do saving in evaluate instead
        save_steps=0,
        num_train_epochs=config["num_epochs"],
        max_steps=config["max_steps"],
        per_device_train_batch_size=config["per_gpu_train_batch_size"],
        per_device_eval_batch_size=config["per_gpu_val_batch_size"],
        warmup_steps=0,
        weight_decay=config["weight_decay"],
        logging_dir="./logs",
    )

    tune_trainer = get_trainer(
        recover_checkpoint(checkpoint_dir, config["model_name"]),
        train_dataset, eval_dataset, config["task_name"], training_args)
    tune_trainer.train(
        recover_checkpoint(checkpoint_dir, config["model_name"]))


# __train_end__


# __tune_begin__
def tune_transformer(num_samples=8,
                     gpus_per_trial=0,
                     smoke_test=False,
                     ray_address=None):
    ray.init(ray_address, log_to_driver=False)
    ray.init(ray_address, log_to_driver=True)
    data_dir_name = "./data" if not smoke_test else "./test_data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    if not os.path.exists(data_dir):
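In the function-API version above, `recover_checkpoint` bridges Tune's checkpoint directory and Hugging Face's `from_pretrained`: a Tune checkpoint for this example contains exactly one Hugging Face `checkpoint-<step>` subdirectory. A brief usage sketch of the helper defined above (the model name is illustrative; `checkpoint_dir` is the value Tune passes when restoring a trial):

# On a fresh trial Tune passes checkpoint_dir=None, so the base model name is
# returned; on restore, the single Huggingface checkpoint folder inside the
# Tune checkpoint directory is returned instead.
model_path = recover_checkpoint(None, model_name="bert-base-uncased")
assert model_path == "bert-base-uncased"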
@@ -119,34 +32,73 @@ def tune_transformer(num_samples=8,

    task_data_dir = os.path.join(data_dir, task_name.upper())

    num_labels = glue_tasks_num_labels[task_name]

    config = AutoConfig.from_pretrained(
        model_name, num_labels=num_labels, finetuning_task=task_name)

    # Download and cache tokenizer, model, and features
    print("Downloading and caching Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Triggers tokenizer download to cache
    AutoTokenizer.from_pretrained(model_name)
    print("Downloading and caching pre-trained model")
    AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
    )

    # Triggers model download to cache
    AutoModelForSequenceClassification.from_pretrained(model_name)
    def get_model():
        return AutoModelForSequenceClassification.from_pretrained(
            model_name,
            config=config,
        )

    # Download data.
    download_data(task_name, data_dir)

    config = {
        "model_name": model_name,
        "task_name": task_name,
        "data_dir": task_data_dir,
        "per_gpu_val_batch_size": 32,
        "per_gpu_train_batch_size": tune.choice([16, 32, 64]),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "weight_decay": tune.uniform(0.0, 0.3),
        "num_epochs": tune.choice([2, 3, 4, 5]),
    data_args = GlueDataTrainingArguments(
        task_name=task_name, data_dir=task_data_dir)

    train_dataset = GlueDataset(
        data_args, tokenizer=tokenizer, mode="train", cache_dir=task_data_dir)
    eval_dataset = GlueDataset(
        data_args, tokenizer=tokenizer, mode="dev", cache_dir=task_data_dir)

    training_args = TrainingArguments(
        output_dir=".",
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,
        eval_steps=(len(train_dataset) // 16) + 1
        if not smoke_test else 1,  # config
        save_steps=(len(train_dataset) // 16) + 1
        if not smoke_test else 1,  # config,
        num_train_epochs=2,  # config
        max_steps=-1,
        per_device_train_batch_size=16,  # config
        per_device_eval_batch_size=16,  # config
        warmup_steps=0,
        weight_decay=0.1,  # config
        logging_dir="./logs",
    )

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(task_name))

    tune_config = {
        "per_device_eval_batch_size": 32,
        "eval_steps": tune.sample_from(
            lambda spec: len(train_dataset) // spec.config["per_device_train_batch_size"] + 1  # noqa: E501
        ) if not smoke_test else 1,
        "save_steps": tune.sample_from(lambda spec: spec.config["eval_steps"]),
        "num_train_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
        "wandb": {
            "project": "pbt_transformers",
            "reinit": True,
            "allow_val_change": True
        }
    }

    scheduler = PopulationBasedTraining(
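The new `tune_config` above leans on `tune.sample_from` so that step-based settings can be derived from whatever batch size a trial drew. A small hedged sketch of that mechanism in isolation (the `1000` stands in for a dataset length and is illustrative):

# Sketch: `spec.config` exposes the values already drawn for this trial, so
# dependent parameters (here eval_steps following the batch size) can be
# computed per trial rather than fixed globally.
from ray import tune

search_space = {
    "per_device_train_batch_size": tune.choice([16, 32, 64]),
    "eval_steps": tune.sample_from(
        lambda spec: 1000 // spec.config["per_device_train_batch_size"] + 1),
    # A parameter can also simply mirror another one:
    "save_steps": tune.sample_from(lambda spec: spec.config["eval_steps"]),
}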
@@ -157,70 +109,36 @@ def tune_transformer(num_samples=8,
        hyperparam_mutations={
            "weight_decay": tune.uniform(0.0, 0.3),
            "learning_rate": tune.uniform(1e-5, 5e-5),
            "per_gpu_train_batch_size": [16, 32, 64],
            "per_device_train_batch_size": [16, 32, 64],
        })

    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_gpu_train_batch_size": "train_bs/gpu",
            "per_device_train_batch_size": "train_bs/gpu",
            "num_epochs": "num_epochs"
        },
        metric_columns=[
            "eval_acc", "eval_loss", "epoch", "training_iteration"
        ])

    analysis = tune.run(
        train_transformer,
    trainer.hyperparameter_search(
        hp_space=lambda _: tune_config,
        backend="ray",
        n_trials=num_samples,
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        keep_checkpoints_num=3,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt")

    if not smoke_test:
        test_best_model(analysis, config["model_name"], config["task_name"],
                        config["data_dir"])


# __tune_end__


def test_best_model(analysis, model_name, task_name, data_dir):
    data_args = DataTrainingArguments(task_name=task_name, data_dir=data_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    best_config = analysis.get_best_config(metric="eval_acc", mode="max")
    print(best_config)
    best_checkpoint = recover_checkpoint(
        analysis.get_best_trial(metric="eval_acc",
                                mode="max").checkpoint.value)
    print(best_checkpoint)
    best_model = AutoModelForSequenceClassification.from_pretrained(
        best_checkpoint).to("cuda")

    test_args = TrainingArguments(output_dir="./best_model_results", )
    test_dataset = GlueDataset(
        data_args, tokenizer=tokenizer, mode="dev", cache_dir=data_dir)
    test_dataset = test_dataset[len(test_dataset) // 2:]

    test_trainer = Trainer(
        best_model,
        test_args,
        compute_metrics=build_compute_metrics_fn(task_name))

    metrics = test_trainer.evaluate(test_dataset)
    print(metrics)
        name="tune_transformer_pbt",
        log_to_file=True)


if __name__ == "__main__":
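The hunk above starts midway through the `PopulationBasedTraining` constructor, so only `hyperparam_mutations` is visible. For orientation, a hedged sketch of a typical PBT setup using Tune's constructor arguments (the metric name and perturbation interval are illustrative, not necessarily what the example uses):

# Sketch: PBT perturbs the mutable hyperparameters of well-performing trials
# every `perturbation_interval` units of `time_attr`.
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",  # clock used for perturbation decisions
    metric="eval_acc",               # illustrative metric name
    mode="max",
    perturbation_interval=1,         # illustrative interval
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "per_device_train_batch_size": [16, 32, 64],
    })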
@@ -1,52 +0,0 @@
import logging
import os
from typing import Dict, Optional, Tuple

from ray import tune

import transformers
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

import torch
from torch.utils.data import Dataset

logger = logging.getLogger(__name__)
"""A Trainer class integrated with Tune.
The only changes to the original transformers.Trainer are:
- Report eval metrics to Tune
- Save state using Tune's checkpoint directories
"""


class TuneTransformerTrainer(transformers.Trainer):
    def get_optimizers(
            self, num_training_steps: int
    ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
        self.current_optimizer, self.current_scheduler = super(
        ).get_optimizers(num_training_steps)
        return (self.current_optimizer, self.current_scheduler)

    def evaluate(self,
                 eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        output = self._prediction_loop(
            eval_dataloader, description="Evaluation")
        self._log(output.metrics)
        self.save_state()
        tune.report(**output.metrics)

        return output.metrics

    def save_state(self):
        with tune.checkpoint_dir(step=self.global_step) as checkpoint_dir:
            self.args.output_dir = checkpoint_dir
            # This is the directory name that Huggingface requires.
            output_dir = os.path.join(
                self.args.output_dir,
                f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")
            self.save_model(output_dir)
            if self.is_world_master():
                torch.save(self.current_optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(self.current_scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))
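The deleted `TuneTransformerTrainer` existed only to wire two Tune primitives into the Hugging Face training loop: `tune.report` for metrics and `tune.checkpoint_dir` for checkpoints. A small sketch of those primitives in a bare function trainable (the metric name and the saved file are illustrative):

# Sketch: inside a Tune trial, `tune.checkpoint_dir` yields a per-step
# directory managed by Tune, and `tune.report` sends metrics back to the
# scheduler (PBT uses them to decide which trials to exploit and perturb).
import os
from ray import tune


def my_trainable(config, checkpoint_dir=None):
    for step in range(3):
        with tune.checkpoint_dir(step=step) as ckpt_dir:
            # Save whatever state the framework needs into ckpt_dir.
            with open(os.path.join(ckpt_dir, "state.txt"), "w") as f:
                f.write(str(step))
        tune.report(eval_acc=0.5 + 0.1 * step)  # illustrative metric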
@@ -25,7 +25,8 @@ tensorflow-probability
timm
torch>=1.5.0
torchvision>=0.6.0
transformers
# transformers
git+git://github.com/huggingface/transformers.git@bdcc4b78a27775d1ec8f3fd297cb679c257289db#transformers
git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
wandb
xgboost
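The requirements change swaps the released `transformers` package for a pinned upstream commit, presumably to pick up `hyperparameter_search` changes that had not shipped in a release at the time. A small hedged sanity check one might run before starting a long tuning job (the assertion message is illustrative):

# Sketch: confirm the installed transformers actually exposes the API this
# example depends on before launching a tuning run.
import transformers
from transformers import Trainer

assert hasattr(Trainer, "hyperparameter_search"), (
    "Install a transformers version that ships Trainer.hyperparameter_search "
    "(the requirements pin an upstream git commit for this).")
print("transformers", transformers.__version__)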