Cha*_*ker (6) · tags: python, machine-learning, pytorch, huggingface-transformers, huggingface-datasets
I've looked around the internet, the Hugging Face (HF) discussion forum, and the HF repos extensively, but I have not found an end-to-end example of how to properly do DDP/distributed data parallel with HF (links at the end).

This is what I need to be able to run end to end:

python -m torch.distributed.launch --nproc_per_node=2 distributed_maml.py

Given answers to these questions, I think I could write my own notebook and share it widely.

This is the starter code I want to complete, but I'm not sure I'm doing it correctly (especially because I don't know which arguments of the Trainer to change):
\n"""\n\n- training on multiple gpus: https://huggingface.co/docs/transformers/perf_train_gpu_many#efficient-training-on-multiple-gpus\n- data paralelism, dp vs ddp: https://huggingface.co/docs/transformers/perf_train_gpu_many#data-parallelism\n- github example: https://github.com/huggingface/transformers/tree/main/examples/pytorch#distributed-training-and-mixed-precision\n - above came from hf discuss: https://discuss.huggingface.co/t/using-transformers-with-distributeddataparallel-any-examples/10775/7\n\n\xe2\x87\xa8 Single Node / Multi-GPU\n\nModel fits onto a single GPU:\n\nDDP - Distributed DP\nZeRO - may or may not be faster depending on the situation and configuration used.\n\n...https://huggingface.co/docs/transformers/perf_train_gpu_many#scalability-strategy\n\npython -m torch.distributed.launch \\\n --nproc_per_node number_of_gpu_you_have path_to_script.py \\\n --all_arguments_of_the_script\n\npython -m torch.distributed.launch --nproc_per_node 2 main_data_parallel_ddp_pg.py\npython -m torch.distributed.launch --nproc_per_node 2 ~/ultimate-utils/tutorials_for_myself/my_hf_hugging_face_pg/main_data_parallel_ddp_pg.py\n\ne.g.\npython -m torch.distributed.launch \\\n --nproc_per_node 8 pytorch/text-classification/run_glue.py \\\n\n --model_name_or_path bert-large-uncased-whole-word-masking \\\n --task_name mnli \\\n --do_train \\\n --do_eval \\\n --max_seq_length 128 \\\n --per_device_train_batch_size 8 \\\n --learning_rate 2e-5 \\\n --num_train_epochs 3.0 \\\n --output_dir /tmp/mnli_output/\n"""\n# %%\n\n# - init group\n# - set up processes a la l2l\n# local_rank: int = local_rank: int = int(os.environ["LOCAL_RANK"]) # get_local_rank()\n# print(f\'{local_rank=}\')\n## init_process_group_l2l(args, local_rank=local_rank, world_size=args.world_size, init_method=args.init_method)\n# init_process_group_l2l bellow\n# if is_running_parallel(rank):\n# print(f\'----> setting up rank={rank} (with world_size={world_size})\')\n# # MASTER_ADDR = \'localhost\'\n# MASTER_ADDR = \'127.0.0.1\'\n# MASTER_PORT = master_port\n# # set up the master\'s ip address so this child process can coordinate\n# os.environ[\'MASTER_ADDR\'] = MASTER_ADDR\n# print(f"---> {MASTER_ADDR=}")\n# os.environ[\'MASTER_PORT\'] = MASTER_PORT\n# print(f"---> {MASTER_PORT=}")\n#\n# # - use NCCL if you are using gpus: https://pytorch.org/tutorials/intermediate/dist_tuto.html#communication-backends\n# if torch.cuda.is_available():\n# backend = \'nccl\'\n# # You need to call torch_uu.cuda.set_device(rank) before init_process_group is called. https://github.com/pytorch/pytorch/issues/54550\n# torch.cuda.set_device(\n# args.device) # is this right if we do parallel cpu? # You need to call torch_uu.cuda.set_device(rank) before init_process_group is called. 
https://github.com/pytorch/pytorch/issues/54550\n# print(f\'---> {backend=}\')\n# rank: int = torch.distributed.get_rank() if is_running_parallel(local_rank) else -1\n\n# https://huggingface.co/docs/transformers/tasks/translation\nimport datasets\nfrom datasets import load_dataset, DatasetDict\n\nbooks: DatasetDict = load_dataset("opus_books", "en-fr")\nprint(f\'{books=}\')\n\nbooks: DatasetDict = books["train"].train_test_split(test_size=0.2)\nprint(f\'{books=}\')\nprint(f\'{books["train"]=}\')\n\nprint(books["train"][0])\n"""\n{\'id\': \'90560\',\n \'translation\': {\'en\': \'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.\',\n \'fr\': \'Mais ce plateau \xc3\xa9lev\xc3\xa9 ne mesurait que quelques toises, et bient\xc3\xb4t nous f\xc3\xbbmes rentr\xc3\xa9s dans notre \xc3\xa9l\xc3\xa9ment.\'}}\n"""\n\n# - t5 tokenizer\n\nfrom transformers import AutoTokenizer, PreTrainedTokenizerFast, PreTrainedTokenizer\n\ntokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained("t5-small")\nprint(f\'{isinstance(tokenizer, PreTrainedTokenizer)=}\')\nprint(f\'{isinstance(tokenizer, PreTrainedTokenizerFast)=}\')\n\nsource_lang = "en"\ntarget_lang = "fr"\nprefix = "translate English to French: "\n\n\ndef preprocess_function(examples):\n inputs = [prefix + example[source_lang] for example in examples["translation"]]\n targets = [example[target_lang] for example in examples["translation"]]\n model_inputs = tokenizer(inputs, max_length=128, truncation=True)\n\n with tokenizer.as_target_tokenizer():\n labels = tokenizer(targets, max_length=128, truncation=True)\n\n model_inputs["labels"] = labels["input_ids"]\n return model_inputs\n\n\n# Then create a smaller subset of the dataset as previously shown to speed up the fine-tuning: (hack to seep up tutorial)\nbooks[\'train\'] = books["train"].shuffle(seed=42).select(range(100))\nbooks[\'test\'] = books["test"].shuffle(seed=42).select(range(100))\n\n# # use Datasets map method to apply a preprocessing function over the entire dataset:\n# tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=2)\n\n# todo - would be nice to remove this since gpt-2/3 size you can\'t preprocess the entire data set...or can you?\n# tokenized_books = books.map(preprocess_function, batched=True, batch_size=2)\nfrom uutils.torch_uu.data_uu.hf_uu_data_preprocessing import preprocess_function_translation_tutorial\n\npreprocessor = lambda examples: preprocess_function_translation_tutorial(examples, tokenizer)\ntokenized_books = books.map(preprocessor, batched=True, batch_size=2)\nprint(f\'{tokenized_books=}\')\n\n# - load model\nfrom transformers import AutoModelForSeq2SeqLM\n\nmodel = AutoModelForSeq2SeqLM.from_pretrained("t5-small")\n\n# - to DDP\n# model = model().to(rank)\n# from torch.nn.parallel import DistributedDataParallel as DDP\n# ddp_model = DDP(model, device_ids=[rank])\n\n# Use DataCollatorForSeq2Seq to create a batch of examples. 
It will also dynamically pad your text and labels to the\n# length of the longest element in its batch, so they are a uniform length.\n# While it is possible to pad your text in the tokenizer function by setting padding=True, dynamic padding is more efficient.\n\nfrom transformers import DataCollatorForSeq2Seq\n\n# Data collator that will dynamically pad the inputs received, as well as the labels.\ndata_collator: DataCollatorForSeq2Seq = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)\n\n"""\nAt this point, only three steps remain:\n\n- Define your training hyperparameters in Seq2SeqTrainingArguments.\n- Pass the training arguments to Seq2SeqTrainer along with the model, dataset, tokenizer, and data collator.\n- Call train() to fine-tune your model.\n"""\nreport_to = "none"\nif report_to != \'none\':\n import wandb\n wandb.init(project="playground", entity="brando", name=\'run_name\', group=\'expt_name\')\n\nfrom transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer\n\n# fp16 = True # cuda\n# fp16 = False # cpu\nimport torch\n\nfp16 = torch.cuda.is_available() # True for cuda, false for cpu\ntraining_args = Seq2SeqTrainingArguments(\n output_dir="./results",\n evaluation_strategy="epoch",\n learning_rate=2e-5,\n per_device_train_batch_size=16,\n per_device_eval_batch_size=16,\n weight_decay=0.01,\n save_total_limit=3,\n num_train_epochs=1,\n fp16=fp16,\n report_to=report_to,\n)\n\ntrainer = Seq2SeqTrainer(\n model=model,\n args=training_args,\n train_dataset=tokenized_books["train"],\n eval_dataset=tokenized_books["test"],\n tokenizer=tokenizer,\n data_collator=data_collator,\n)\n\ntrainer.train()\n\nprint(\'\\n ----- Success\\a\')\nRun Code Online (Sandbox Code Playgroud)\n我在写这个问题时查阅的所有参考文献:
You don't need to set anything up; just run:
python -m torch.distributed.launch --nproc_per_node 2 ~/src/main_debug.py
or
torchrun --nproc_per_node=2 --nnodes=2 --use_env ~/src/main_debug.py
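For concreteness, here is a minimal sketch of what such a script could look like (a hypothetical stand-in for main_debug.py above, reusing the toy opus_books/t5-small setup from the question; the file name and the tiny 100-example slice are only illustrative). The point is that there is no DDP-specific code in the script itself: with a recent transformers version, the Trainer picks up the distributed environment set by the launcher and does the DDP wrapping for you.

# main_debug.py -- hypothetical stand-in for the script launched above; minimal sketch only.
# Launch with, e.g.:  torchrun --nproc_per_node=2 main_debug.py
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Tiny slice of the same opus_books en-fr data as in the question, just to keep the sketch fast.
books = load_dataset("opus_books", "en-fr")["train"].train_test_split(test_size=0.2)
books["train"] = books["train"].select(range(100))
books["test"] = books["test"].select(range(100))

def preprocess(examples):
    inputs = ["translate English to French: " + ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = books.map(preprocess, batched=True, remove_columns=books["train"].column_names)

# No DDP-specific code here: when launched with torchrun / torch.distributed.launch, recent
# transformers versions read LOCAL_RANK (and friends) from the environment, initialize the
# process group, shard the batches across processes, and wrap the model in
# DistributedDataParallel inside the Trainer. Older versions may instead need --local_rank
# parsed from the command line.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,  # per GPU; effective batch size is 16 * world_size
    num_train_epochs=1,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
)
trainer.train()

Each process runs this same file; only the launcher invocation changes between single-GPU and multi-GPU runs.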
For example, with Alpaca:
torchrun --nproc_per_node=4 --master_port=<your_random_port> train.py \
--model_name_or_path <your_path_to_hf_converted_llama_ckpt_and_tokenizer> \
--data_path ./alpaca_data.json \
--bf16 True \
--output_dir <your_output_dir> \
--num_train_epochs 3 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 2000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--fsdp "full_shard auto_wrap" \
--fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
--tf32 True
Reference: https://github.com/tatsu-lab/stanford_alpaca#fine-tuning
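If you prefer to keep the configuration in Python rather than on the command line, the same flags map one-to-one onto TrainingArguments fields. Below is a minimal sketch of that mapping only; it is not the Alpaca repo's actual code, the file name is hypothetical, and the model/tokenizer/dataset construction is left out (see the linked train.py for that):

# fsdp_args_sketch.py -- hypothetical; launch with:
#   torchrun --nproc_per_node=4 --master_port=<your_random_port> fsdp_args_sketch.py
from transformers import TrainingArguments, Trainer

# The same knobs as the torchrun command above, expressed as TrainingArguments.
training_args = TrainingArguments(
    output_dir="./alpaca_output",  # stands in for <your_output_dir> above
    bf16=True,
    tf32=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    evaluation_strategy="no",
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=1,
    learning_rate=2e-5,
    weight_decay=0.0,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=1,
    # FSDP instead of plain DDP: shard parameters, gradients and optimizer state,
    # auto-wrapping each transformer block of the model.
    fsdp="full_shard auto_wrap",
    fsdp_transformer_layer_cls_to_wrap="LlamaDecoderLayer",
)

# model, tokenizer and train_dataset are assumed to be built as in the linked train.py:
# trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, tokenizer=tokenizer)
# trainer.train()

As with the DDP case, torchrun only launches the processes; the sharding itself is driven by the two fsdp arguments inside the Trainer.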