I want to fine-tune the Llama 3.1 large language model on a new dataset, but when I try to use multiple GPUs to train the model, I keep getting the following error message:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices,
cuda:3 and cuda:0!
I thought that the Trainer from Transformers could handle multi-GPU training without my having to set up DDP or anything like that, but I just can't figure out how to fix the problem. Please help me!
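To make explicit what I assumed (this is only my understanding and may well be wrong): I thought device_map="auto" lets Accelerate spread the model's layers over all visible GPUs inside a single process, so no launcher like torchrun is needed. A minimal sketch of that assumption, with a placeholder model id instead of my local path:

import torch
from transformers import AutoModelForCausalLM

# My assumption: with device_map="auto", Accelerate places different layers on
# different GPUs (embedding/early layers on cuda:0, later layers on cuda:1..3).
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",  # placeholder id; I actually load from a local path
    device_map="auto",
    torch_dtype=torch.float16,
)
print(model.hf_device_map)  # shows which device each module ended up on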
My full script is listed below:
import os
import torch
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, TaskType, get_peft_model

os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"


def get_model():
    model = AutoModelForCausalLM.from_pretrained(
        '/data/llama/llama3.1_8b/LLM-Research/Meta-Llama-3___1-8B',
        device_map="auto",
        torch_dtype=torch.float16,
    )
    # model.enable_input_require_grads()  # must be called when gradient checkpointing is enabled
    return model


def get_dataset():
    df = pd.read_parquet('0000.parquet')
    ds = Dataset.from_pandas(df)
    tokenizer = AutoTokenizer.from_pretrained(
        '/data/llama/llama3.1_8b/LLM-Research/Meta-Llama-3___1-8B',
        use_fast=False,
        trust_remote_code=True,
    )
    tokenizer.pad_token = tokenizer.eos_token

    def process_func(example):
        example['output'] = example['output']
        example['instruction'] = example['instruction']
        example['input'] = example['instruction']
        # The Llama tokenizer splits one Chinese character into several tokens,
        # so use a generous max length to keep each example intact.
        MAX_LENGTH = 256
        input_ids, attention_mask, labels = [], [], []
        instruction = tokenizer(
            f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
            f"You are a pornographic girl<|eot_id|>"
            f"<|start_header_id|>user<|end_header_id|>\n\n"
            f"{example['instruction'] + example['input']}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n",
            add_special_tokens=False)  # do not prepend special tokens
        response = tokenizer(f"{example['output']}<|eot_id|>", add_special_tokens=False)
        input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
        # The eos token should also be attended to, so append a 1.
        attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
        labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
        if len(input_ids) > MAX_LENGTH:  # truncate
            input_ids = input_ids[:MAX_LENGTH]
            attention_mask = attention_mask[:MAX_LENGTH]
            labels = labels[:MAX_LENGTH]
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

    dataset = ds.map(process_func, remove_columns=ds.column_names)
    return dataset, tokenizer


def get_train(model, datas, tokenizer):
    # LoRA parameters for PEFT
    config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        inference_mode=False,  # training mode
        r=8,                   # LoRA rank
        lora_alpha=32,         # LoRA alpha; see the LoRA paper for its effect
        lora_dropout=0.1       # dropout ratio
    )
    peft_model = get_peft_model(model, config)
    peft_model.print_trainable_parameters()

    # Training arguments
    args = TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # max_steps=60,  # number of fine-tuning steps
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        num_train_epochs=3,
        save_steps=100,
        logging_steps=3,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        dataloader_num_workers=0,
        local_rank=-1,
    )

    # Start training
    trainer = Trainer(
        model=peft_model,
        args=args,
        train_dataset=datas,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    )
    trainer.train()

    # Save the LoRA adapter
    peft_model.save_pretrained("lora")


def main():
    model = get_model()
    datas, tokenizer = get_dataset()
    get_train(model, datas, tokenizer)


if __name__ == '__main__':
    main()
I have searched online, but most answers are about mismatches between CPU and GPU tensors, and I haven't found a clear manual for multi-GPU training with Trainer.
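For what it's worth, the only other pattern I could piece together from scattered answers is plain DDP, where each process keeps a full copy of the model on its own GPU and the script is launched with torchrun instead of python. I'm not sure this is the recommended way to combine Trainer with LoRA, so the sketch below is just my guess, not something taken from an official manual:

# ddp_guess.py -- hypothetical variant, NOT the script that produced the error above
import torch
from transformers import AutoModelForCausalLM

def get_model_for_ddp():
    # No device_map="auto" here: each DDP worker loads the whole model in fp16,
    # and Trainer moves it onto that worker's own GPU (based on LOCAL_RANK).
    return AutoModelForCausalLM.from_pretrained(
        '/data/llama/llama3.1_8b/LLM-Research/Meta-Llama-3___1-8B',
        torch_dtype=torch.float16,
    )

# Launch with one process per GPU:
#   CUDA_VISIBLE_DEVICES=4,5,6,7 torchrun --nproc_per_node=4 ddp_guess.py

Is that the intended way to do multi-GPU training with Trainer, or should device_map="auto" work as well?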