support effective tokens calculation on sft/dpo #6078

Merged · 4 commits · Nov 20, 2024

Changes from 3 commits
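This PR counts the tokens of every training example up front (before the data collator adds padding) and, after training, reports an effective_tokens_per_sec throughput metric derived from the Trainer's epoch and train_runtime metrics. A minimal sketch of the calculation as it appears in both diffs below; whether the world-size division is meant to report a per-device rate is an assumption here, not something the PR states:

import torch.distributed as dist

def effective_tokens_per_sec(effective_token_num: float, epoch: float, train_runtime: float) -> float:
    # total tokens consumed = dataset tokens per epoch * number of epochs,
    # divided by wall-clock training time in seconds
    tps = effective_token_num * epoch / train_runtime
    if dist.is_initialized():
        # assumption: every rank counts the full (unsharded) dataset, so the
        # division by world size turns the total into a per-device rate
        tps /= dist.get_world_size()
    return tps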
15 changes: 15 additions & 0 deletions src/llamafactory/train/dpo/workflow.py
@@ -17,6 +17,8 @@

 from typing import TYPE_CHECKING, List, Optional

+import torch.distributed as dist
+
 from ...data import PairwiseDataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer
 from ...extras.constants import IGNORE_INDEX
 from ...extras.ploting import plot_loss

@@ -64,6 +66,11 @@ def run_dpo(
     # Update arguments
     training_args.remove_unused_columns = False  # important for multimodal and pairwise dataset

+    effective_token_num = 0.0
+    for data in dataset_module["train_dataset"]:
+        effective_token_num += len(data["chosen_input_ids"])
+        effective_token_num += len(data["rejected_input_ids"])
+
     # Initialize our Trainer
     trainer = CustomDPOTrainer(
         model=model,

@@ -79,6 +86,14 @@
     # Training
     if training_args.do_train:
         train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        train_result.metrics["effective_tokens_per_sec"] = (
+            effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"]
+        )
+        if dist.is_initialized():
+            train_result.metrics["effective_tokens_per_sec"] = (
+                train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size()
+            )
+
         trainer.save_model()
         trainer.log_metrics("train", train_result.metrics)
         trainer.save_metrics("train", train_result.metrics)
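For DPO, each preference pair contributes both its chosen and its rejected sequence to the count, since both are forwarded through the model. A worked example of the resulting metric with made-up numbers (every value below is hypothetical):

effective_token_num = 1_000_000  # hypothetical dataset size in tokens
epoch, train_runtime, world_size = 3.0, 600.0, 8  # hypothetical run
tps = effective_token_num * epoch / train_runtime / world_size
print(tps)  # 625.0 effective tokens per second per device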
14 changes: 14 additions & 0 deletions src/llamafactory/train/sft/workflow.py
@@ -17,6 +17,8 @@

 from typing import TYPE_CHECKING, List, Optional

+import torch.distributed as dist
+
 from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer
 from ...extras.constants import IGNORE_INDEX
 from ...extras.misc import get_logits_processor

@@ -65,6 +67,10 @@ def run_sft(
     training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
     training_args.remove_unused_columns = False  # important for multimodal dataset

+    effective_token_num = 0.0
+    for data in dataset_module["train_dataset"]:
+        effective_token_num += len(data["input_ids"])
+
     # Metric utils
     metric_module = {}
     if training_args.predict_with_generate:

@@ -94,6 +100,14 @@
     # Training
     if training_args.do_train:
         train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        train_result.metrics["effective_tokens_per_sec"] = (
+            effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"]
+        )
+        if dist.is_initialized():
+            train_result.metrics["effective_tokens_per_sec"] = (
+                train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size()
+            )
+
         trainer.save_model()
         trainer.log_metrics("train", train_result.metrics)
         trainer.save_metrics("train", train_result.metrics)
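The SFT path is the same except that each example contributes only its input_ids. Because the metric is attached to train_result.metrics before trainer.save_metrics("train", ...) runs, it is written to the usual train_results.json alongside train_runtime and the other Trainer metrics. A quick way to check it after a run (the output directory is a placeholder):

import json
import os

output_dir = "saves/my-run"  # hypothetical; use your actual output_dir
with open(os.path.join(output_dir, "train_results.json")) as f:
    metrics = json.load(f)
print(metrics.get("effective_tokens_per_sec"))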