[tune] Example for using huggingface hyperparameter_search API (#11158)
commit 681c24754a (parent a866be381c)

4 changed files with 78 additions and 212 deletions
Documentation page for the pbt_transformers example (.rst):

@@ -4,5 +4,4 @@ pbt_transformers_example
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/pbt_transformers.py
-.. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/trainer.py
 .. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/utils.py
python/ray/tune/examples/pbt_transformers/pbt_transformers.py:

@@ -1,112 +1,25 @@
+"""
+This example is uses the official
+huggingface transformers `hyperparameter_search` API.
+"""
 import os
 
 import ray
-from ray.tune import CLIReporter
-from ray.tune.integration.wandb import wandb_mixin  # noqa: F401
-from ray.tune.schedulers import PopulationBasedTraining
-
 from ray import tune
-from ray.tune.examples.pbt_transformers.utils import \
-    build_compute_metrics_fn, download_data
-from ray.tune.examples.pbt_transformers import trainer
-
-from transformers import (AutoConfig, AutoModelForSequenceClassification,
-                          AutoTokenizer, GlueDataset, GlueDataTrainingArguments
-                          as DataTrainingArguments, glue_tasks_num_labels,
-                          Trainer, TrainingArguments)
+from ray.tune import CLIReporter
+from ray.tune.examples.pbt_transformers.utils import download_data, \
+    build_compute_metrics_fn
+from ray.tune.schedulers import PopulationBasedTraining
+from transformers import glue_tasks_num_labels, AutoConfig, \
+    AutoModelForSequenceClassification, AutoTokenizer, Trainer, GlueDataset, \
+    GlueDataTrainingArguments, TrainingArguments
 
 
-def get_trainer(model_name_or_path, train_dataset, eval_dataset, task_name,
-                training_args):
-    try:
-        num_labels = glue_tasks_num_labels[task_name]
-    except KeyError:
-        raise ValueError("Task not found: %s" % (task_name))
-
-    config = AutoConfig.from_pretrained(
-        model_name_or_path, num_labels=num_labels, finetuning_task=task_name)
-
-    model = AutoModelForSequenceClassification.from_pretrained(
-        model_name_or_path,
-        config=config,
-    )
-    tune_trainer = trainer.TuneTransformerTrainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        compute_metrics=build_compute_metrics_fn(task_name))
-
-    return tune_trainer
-
-
-def recover_checkpoint(tune_checkpoint_dir, model_name=None):
-    if tune_checkpoint_dir is None or len(tune_checkpoint_dir) == 0:
-        return model_name
-    # Get subdirectory used for Huggingface.
-    subdirs = [
-        os.path.join(tune_checkpoint_dir, name)
-        for name in os.listdir(tune_checkpoint_dir)
-        if os.path.isdir(os.path.join(tune_checkpoint_dir, name))
-    ]
-    # There should only be 1 subdir.
-    assert len(subdirs) == 1, subdirs
-    return subdirs[0]
-
-
-# __train_begin__
-# Uncomment this line to use W&B!
-# @wandb_mixin
-def train_transformer(config, checkpoint_dir=None):
-    data_args = DataTrainingArguments(
-        task_name=config["task_name"], data_dir=config["data_dir"])
-    tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
-    train_dataset = GlueDataset(
-        data_args,
-        tokenizer=tokenizer,
-        mode="train",
-        cache_dir=config["data_dir"])
-    eval_dataset = GlueDataset(
-        data_args,
-        tokenizer=tokenizer,
-        mode="dev",
-        cache_dir=config["data_dir"])
-    eval_dataset = eval_dataset[:len(eval_dataset) // 2]
-    training_args = TrainingArguments(
-        output_dir=tune.get_trial_dir(),
-        learning_rate=config["learning_rate"],
-        do_train=True,
-        do_eval=True,
-        evaluate_during_training=True,
-        eval_steps=(len(train_dataset) // config["per_gpu_train_batch_size"]) +
-        1,
-        # We explicitly set save to 0, and do saving in evaluate instead
-        save_steps=0,
-        num_train_epochs=config["num_epochs"],
-        max_steps=config["max_steps"],
-        per_device_train_batch_size=config["per_gpu_train_batch_size"],
-        per_device_eval_batch_size=config["per_gpu_val_batch_size"],
-        warmup_steps=0,
-        weight_decay=config["weight_decay"],
-        logging_dir="./logs",
-    )
-
-    tune_trainer = get_trainer(
-        recover_checkpoint(checkpoint_dir, config["model_name"]),
-        train_dataset, eval_dataset, config["task_name"], training_args)
-    tune_trainer.train(
-        recover_checkpoint(checkpoint_dir, config["model_name"]))
-
-
-# __train_end__
-
-
-# __tune_begin__
 def tune_transformer(num_samples=8,
                      gpus_per_trial=0,
                      smoke_test=False,
                      ray_address=None):
-    ray.init(ray_address, log_to_driver=False)
+    ray.init(ray_address, log_to_driver=True)
     data_dir_name = "./data" if not smoke_test else "./test_data"
     data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
     if not os.path.exists(data_dir):
@@ -119,34 +32,73 @@ def tune_transformer(num_samples=8,
     task_data_dir = os.path.join(data_dir, task_name.upper())
 
+    num_labels = glue_tasks_num_labels[task_name]
+
+    config = AutoConfig.from_pretrained(
+        model_name, num_labels=num_labels, finetuning_task=task_name)
+
     # Download and cache tokenizer, model, and features
     print("Downloading and caching Tokenizer")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
 
     # Triggers tokenizer download to cache
-    AutoTokenizer.from_pretrained(model_name)
     print("Downloading and caching pre-trained model")
+    AutoModelForSequenceClassification.from_pretrained(
+        model_name,
+        config=config,
+    )
 
-    # Triggers model download to cache
-    AutoModelForSequenceClassification.from_pretrained(model_name)
+    def get_model():
+        return AutoModelForSequenceClassification.from_pretrained(
+            model_name,
+            config=config,
+        )
 
     # Download data.
     download_data(task_name, data_dir)
 
-    config = {
-        "model_name": model_name,
-        "task_name": task_name,
-        "data_dir": task_data_dir,
-        "per_gpu_val_batch_size": 32,
-        "per_gpu_train_batch_size": tune.choice([16, 32, 64]),
-        "learning_rate": tune.uniform(1e-5, 5e-5),
-        "weight_decay": tune.uniform(0.0, 0.3),
-        "num_epochs": tune.choice([2, 3, 4, 5]),
-        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
-        "wandb": {
-            "project": "pbt_transformers",
-            "reinit": True,
-            "allow_val_change": True
-        }
-    }
+    data_args = GlueDataTrainingArguments(
+        task_name=task_name, data_dir=task_data_dir)
+
+    train_dataset = GlueDataset(
+        data_args, tokenizer=tokenizer, mode="train", cache_dir=task_data_dir)
+    eval_dataset = GlueDataset(
+        data_args, tokenizer=tokenizer, mode="dev", cache_dir=task_data_dir)
+
+    training_args = TrainingArguments(
+        output_dir=".",
+        learning_rate=1e-5,  # config
+        do_train=True,
+        do_eval=True,
+        evaluate_during_training=True,
+        eval_steps=(len(train_dataset) // 16) + 1
+        if not smoke_test else 1,  # config
+        save_steps=(len(train_dataset) // 16) + 1
+        if not smoke_test else 1,  # config,
+        num_train_epochs=2,  # config
+        max_steps=-1,
+        per_device_train_batch_size=16,  # config
+        per_device_eval_batch_size=16,  # config
+        warmup_steps=0,
+        weight_decay=0.1,  # config
+        logging_dir="./logs",
+    )
+
+    trainer = Trainer(
+        model_init=get_model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=build_compute_metrics_fn(task_name))
+
+    tune_config = {
+        "per_device_eval_batch_size": 32,
+        "eval_steps": tune.sample_from(
+            lambda spec: len(train_dataset) // spec.config["per_device_train_batch_size"] + 1  # noqa: E501
+        ) if not smoke_test else 1,
+        "save_steps": tune.sample_from(lambda spec: spec.config["eval_steps"]),
+        "num_train_epochs": tune.choice([2, 3, 4, 5]),
+        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
+    }
 
     scheduler = PopulationBasedTraining(
@@ -157,70 +109,36 @@
         hyperparam_mutations={
             "weight_decay": tune.uniform(0.0, 0.3),
             "learning_rate": tune.uniform(1e-5, 5e-5),
-            "per_gpu_train_batch_size": [16, 32, 64],
+            "per_device_train_batch_size": [16, 32, 64],
         })
 
     reporter = CLIReporter(
         parameter_columns={
             "weight_decay": "w_decay",
             "learning_rate": "lr",
-            "per_gpu_train_batch_size": "train_bs/gpu",
+            "per_device_train_batch_size": "train_bs/gpu",
            "num_epochs": "num_epochs"
         },
         metric_columns=[
             "eval_acc", "eval_loss", "epoch", "training_iteration"
         ])
 
-    analysis = tune.run(
-        train_transformer,
+    trainer.hyperparameter_search(
+        hp_space=lambda _: tune_config,
+        backend="ray",
+        n_trials=num_samples,
         resources_per_trial={
             "cpu": 1,
             "gpu": gpus_per_trial
         },
-        config=config,
-        num_samples=num_samples,
         scheduler=scheduler,
         keep_checkpoints_num=3,
         checkpoint_score_attr="training_iteration",
         stop={"training_iteration": 1} if smoke_test else None,
         progress_reporter=reporter,
         local_dir="~/ray_results/",
-        name="tune_transformer_pbt")
-
-    if not smoke_test:
-        test_best_model(analysis, config["model_name"], config["task_name"],
-                        config["data_dir"])
-
-
-# __tune_end__
-
-
-def test_best_model(analysis, model_name, task_name, data_dir):
-    data_args = DataTrainingArguments(task_name=task_name, data_dir=data_dir)
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    best_config = analysis.get_best_config(metric="eval_acc", mode="max")
-    print(best_config)
-    best_checkpoint = recover_checkpoint(
-        analysis.get_best_trial(metric="eval_acc",
-                                mode="max").checkpoint.value)
-    print(best_checkpoint)
-    best_model = AutoModelForSequenceClassification.from_pretrained(
-        best_checkpoint).to("cuda")
-
-    test_args = TrainingArguments(output_dir="./best_model_results", )
-    test_dataset = GlueDataset(
-        data_args, tokenizer=tokenizer, mode="dev", cache_dir=data_dir)
-    test_dataset = test_dataset[len(test_dataset) // 2:]
-
-    test_trainer = Trainer(
-        best_model,
-        test_args,
-        compute_metrics=build_compute_metrics_fn(task_name))
-
-    metrics = test_trainer.evaluate(test_dataset)
-    print(metrics)
+        name="tune_transformer_pbt",
+        log_to_file=True)
 
 
 if __name__ == "__main__":
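For reference, the call pattern the new script adopts can be tried on its own. The sketch below is a minimal, self-contained use of Trainer.hyperparameter_search with the Ray Tune backend; the model name, the toy dataset, and the search-space values are illustrative and are not part of this commit, and it assumes a transformers version that ships the hyperparameter_search API (the requirements change further below pins such a commit).

import torch
from ray import tune
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

model_name = "distilbert-base-uncased"  # illustrative choice
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tiny in-memory dataset so the sketch runs end to end without GLUE data.
texts = ["ray tune is great", "this sentence is negative"] * 8
labels = [1, 0] * 8
encodings = tokenizer(texts, truncation=True, padding=True)


class ToyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def model_init():
    # hyperparameter_search re-instantiates the model for every trial.
    return AutoModelForSequenceClassification.from_pretrained(model_name)


trainer = Trainer(
    model_init=model_init,
    args=TrainingArguments(output_dir="hp_search", num_train_epochs=1),
    train_dataset=ToyDataset(encodings, labels),
    eval_dataset=ToyDataset(encodings, labels))

# hp_space returns a Tune search space; extra keyword arguments such as
# resources_per_trial are forwarded to tune.run by the "ray" backend.
best_run = trainer.hyperparameter_search(
    hp_space=lambda _: {
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "per_device_train_batch_size": tune.choice([4, 8]),
    },
    backend="ray",
    n_trials=2,
    resources_per_trial={"cpu": 1, "gpu": 0})
print(best_run.hyperparameters)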
python/ray/tune/examples/pbt_transformers/trainer.py (removed):

@@ -1,52 +0,0 @@
-import logging
-import os
-from typing import Dict, Optional, Tuple
-
-from ray import tune
-
-import transformers
-from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
-
-import torch
-from torch.utils.data import Dataset
-
-logger = logging.getLogger(__name__)
-"""A Trainer class integrated with Tune.
-The only changes to the original transformers.Trainer are:
-    - Report eval metrics to Tune
-    - Save state using Tune's checkpoint directories
-"""
-
-
-class TuneTransformerTrainer(transformers.Trainer):
-    def get_optimizers(
-            self, num_training_steps: int
-    ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
-        self.current_optimizer, self.current_scheduler = super(
-        ).get_optimizers(num_training_steps)
-        return (self.current_optimizer, self.current_scheduler)
-
-    def evaluate(self,
-                 eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:
-        eval_dataloader = self.get_eval_dataloader(eval_dataset)
-        output = self._prediction_loop(
-            eval_dataloader, description="Evaluation")
-        self._log(output.metrics)
-        self.save_state()
-        tune.report(**output.metrics)
-
-        return output.metrics
-
-    def save_state(self):
-        with tune.checkpoint_dir(step=self.global_step) as checkpoint_dir:
-            self.args.output_dir = checkpoint_dir
-            # This is the directory name that Huggingface requires.
-            output_dir = os.path.join(
-                self.args.output_dir,
-                f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")
-            self.save_model(output_dir)
-            if self.is_world_master():
-                torch.save(self.current_optimizer.state_dict(),
-                           os.path.join(output_dir, "optimizer.pt"))
-                torch.save(self.current_scheduler.state_dict(),
-                           os.path.join(output_dir, "scheduler.pt"))
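The deleted TuneTransformerTrainer existed only to push evaluation metrics to Tune and to write checkpoints into Tune-managed directories; those responsibilities move out of the example now that hyperparameter_search drives the trials. As a reminder of the two Tune primitives it relied on, here is a minimal function-API sketch using tune.report and tune.checkpoint_dir; the metric name, checkpoint payload, and training loop are placeholders, not code from this commit.

import os

import torch
from ray import tune


def trainable(config, checkpoint_dir=None):
    start = 0
    if checkpoint_dir:
        # Restore after a pause or a PBT perturbation.
        state = torch.load(os.path.join(checkpoint_dir, "state.pt"))
        start = state["step"] + 1
    for step in range(start, 10):
        eval_acc = 0.5 + 0.05 * step * config["lr"] / 1e-5  # placeholder metric
        with tune.checkpoint_dir(step=step) as ckpt_dir:
            torch.save({"step": step}, os.path.join(ckpt_dir, "state.pt"))
        tune.report(eval_acc=eval_acc)


analysis = tune.run(
    trainable,
    config={"lr": tune.uniform(1e-5, 5e-5)},
    num_samples=2)
print(analysis.get_best_config(metric="eval_acc", mode="max"))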
Tune requirements file:

@@ -25,7 +25,8 @@ tensorflow-probability
 timm
 torch>=1.5.0
 torchvision>=0.6.0
-transformers
+# transformers
+git+git://github.com/huggingface/transformers.git@bdcc4b78a27775d1ec8f3fd297cb679c257289db#transformers
 git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
 wandb
 xgboost