EADST

ms-swift LoRA SFT 训练脚本:Qwen-3.5-9B

ms-swift LoRA SFT 训练脚本:Qwen-3.5-9B,Len1024,Batch128

完整脚本

#!/usr/bin/env bash
set -euo pipefail

# ms-swift LoRA SFT launch script: max_length=1024, effective batch=128
# Usage:
#   bash run_ms_swift_lora_sft_len1024_bs128.sh
#
# Default effective batch on 8 GPUs:
#   8 GPUs * per_device_train_batch_size 2 * gradient_accumulation_steps 8 = 128 samples / optimizer step
#
# Optional overrides:
#   CUDA_VISIBLE_DEVICES=0,1,2,3 NPROC_PER_NODE=4 PER_DEVICE_TRAIN_BATCH_SIZE=2 GRADIENT_ACCUMULATION_STEPS=16 bash run_ms_swift_lora_sft_len1024_bs128.sh

# Fixed experiment locations. The raw ShareGPT files live in WORK_DIR; the
# converted "messages" JSONL files are written under DATA_DIR.
WORK_DIR="/root/paddlejob/workspace/env_run/output/dong/sft"
MODEL_PATH="/root/paddlejob/workspace/env_run/output/model/qwen-3.5-9b"
RAW_TRAIN_DATASET="${WORK_DIR}/sft_train_sharegpt.json"
RAW_VAL_DATASET="${WORK_DIR}/sft_val_sharegpt.json"
DATA_DIR="${WORK_DIR}/ms_swift_data"
TRAIN_DATASET="${DATA_DIR}/sft_train_messages.jsonl"
VAL_DATASET="${DATA_DIR}/sft_val_messages.jsonl"
OUTPUT_ROOT="${WORK_DIR}/outputs/qwen-3.5-9b-lora-sft-len1024-bs128"

#######################################
# Prints what `command -v` reports for the first candidate it can resolve,
# or nothing when no candidate is found.
# Arguments: candidate command names, in priority order
# Outputs:   at most one line on stdout
# Returns:   always 0, so a miss inside a command substitution cannot abort
#            the script under `set -e`; callers test for an empty result.
#######################################
first_available_command() {
  local candidate
  for candidate in "$@"; do
    if command -v "${candidate}"; then
      return 0
    fi
  done
  return 0
}

# Both variables may be pre-set by the caller. When discovery finds nothing
# they are left empty so the executable checks further down can print a
# readable error instead of the script dying silently on this line.
PYTHON_BIN="${PYTHON_BIN:-$(first_available_command python python3)}"
SWIFT_BIN="${SWIFT_BIN:-$(first_available_command swift)}"

# SwanLab reporting. The API key default is a literal placeholder; supply the
# real key through the environment before launching.
: "${SWANLAB_API_KEY:=YOUR_SWANLAB_API_KEY}"
export SWANLAB_API_KEY
: "${SWANLAB_PROJECT:=qwen-3.5-9b-sft}"
# RUN_TAG defaults to the launch timestamp so every invocation gets a
# distinct run name.
if [[ -z "${RUN_TAG:-}" ]]; then
  RUN_TAG="$(date +%Y%m%d-%H%M%S)"
fi

# GPU / distributed settings. Each one keeps a caller-provided value and
# falls back to the default otherwise; all four are exported.
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
: "${NPROC_PER_NODE:=8}"
: "${NCCL_DEBUG:=WARN}"
: "${TOKENIZERS_PARALLELISM:=false}"
export CUDA_VISIBLE_DEVICES NPROC_PER_NODE NCCL_DEBUG TOKENIZERS_PARALLELISM

# Training hyperparameters. Defaults target 8xA800 with effective batch size
# 128 (NPROC_PER_NODE * PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS).
: "${NUM_TRAIN_EPOCHS:=3}"
: "${MAX_STEPS:=1000}"
: "${PER_DEVICE_TRAIN_BATCH_SIZE:=2}"
: "${PER_DEVICE_EVAL_BATCH_SIZE:=2}"
: "${GRADIENT_ACCUMULATION_STEPS:=8}"
: "${MAX_LENGTH:=1024}"
: "${LORA_DROPOUT:=0.1}"
: "${EVAL_STEPS:=100}"
: "${SAVE_STEPS:=100}"
: "${SAVE_TOTAL_LIMIT:=5}"
# Whitespace-separated list of "<lora_rank>:<learning_rate>" pairs, one
# training run per pair.
: "${LORA_LR_GRID:=16:3e-5 16:5e-5}"

# Run from the experiment directory and make sure the shared output root
# exists before anything is written.
cd "${WORK_DIR}"
mkdir -p "${OUTPUT_ROOT}"

# Stop early unless PYTHON_BIN is non-empty and passes the -x test.
if ! [[ -n "${PYTHON_BIN}" && -x "${PYTHON_BIN}" ]]; then
  echo "ERROR: python executable not found: ${PYTHON_BIN}" >&2
  exit 1
fi

# Same requirement for the swift launcher; the hint names the packages the
# script depends on.
if ! [[ -n "${SWIFT_BIN}" && -x "${SWIFT_BIN}" ]]; then
  printf '%s\n' \
    "ERROR: swift command not found." \
    "Please install ms-swift and swanlab in the current environment first." >&2
  exit 1
fi

# Ask the selected interpreter where it would import the 'swift' module from.
# This catches the case where the swift executable on PATH belongs to a
# different environment than PYTHON_BIN. The interpreter path is quoted so a
# location containing spaces is not word-split.
# Note: despite its name, SWIFT_SOURCE_DIR holds spec.origin as printed by
# the probe below (presumably the module's file path, not a directory).
SWIFT_SOURCE_DIR="$("${PYTHON_BIN}" - <<'PY'
import importlib.util
import sys
spec = importlib.util.find_spec('swift')
if spec is None or not spec.origin:
    sys.exit(1)
print(spec.origin)
PY
)" || {
  cat >&2 <<MSG
ERROR: Python cannot import module 'swift'.
The current swift executable is: ${SWIFT_BIN}
The current Python executable is: ${PYTHON_BIN}
MSG
  exit 1
}

# Startup banner: record the resolved toolchain and the effective batch size
# (processes * per-device batch * gradient accumulation) in the log.
echo "Using Python: ${PYTHON_BIN}"
echo "Using swift: ${SWIFT_BIN}"
echo "Using swift module: ${SWIFT_SOURCE_DIR}"
echo "Effective batch size: $((NPROC_PER_NODE * PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))"

# Validate every input up front so a missing path fails here with a clear
# message rather than somewhere inside data conversion or training.
if [[ ! -d "${MODEL_PATH}" ]]; then
  echo "ERROR: model path not found: ${MODEL_PATH}" >&2
  exit 1
fi

if [[ ! -f "${RAW_TRAIN_DATASET}" ]]; then
  echo "ERROR: raw train dataset not found: ${RAW_TRAIN_DATASET}" >&2
  exit 1
fi

if [[ ! -f "${RAW_VAL_DATASET}" ]]; then
  echo "ERROR: raw validation dataset not found: ${RAW_VAL_DATASET}" >&2
  exit 1
fi

# The converted files are written under DATA_DIR, which nothing else in this
# script creates; make sure it exists before the converter opens its output.
mkdir -p "${DATA_DIR}"

# Convert the raw ShareGPT JSON files into the JSONL files passed to
# `swift sft` via --dataset / --val_dataset.
"${PYTHON_BIN}" "${WORK_DIR}/prepare_ms_swift_dataset.py" --input "${RAW_TRAIN_DATASET}" --output "${TRAIN_DATASET}"
"${PYTHON_BIN}" "${WORK_DIR}/prepare_ms_swift_dataset.py" --input "${RAW_VAL_DATASET}" --output "${VAL_DATASET}"

#######################################
# Chooses the option name used to select the tuner type for `swift sft`.
# The help text is captured in full before it is inspected. Piping it into
# `grep -q` under `set -o pipefail` is unreliable: grep exits on the first
# match, the writer can then fail on a closed pipe, and the failed pipeline
# would be misread as "flag not supported". A non-zero exit status from
# `--help` itself is likewise ignored; only the text matters.
# Arguments: $1 - swift executable (or any command name) to query
# Outputs:   "--train_type" if the help text mentions it, else "--tuner_type"
# Returns:   0
#######################################
detect_train_type_flag() {
  local swift_bin="$1"
  local help_text
  help_text="$("${swift_bin}" sft --help 2>/dev/null || true)"
  if [[ "${help_text}" == *"--train_type"* ]]; then
    printf '%s\n' "--train_type"
  else
    printf '%s\n' "--tuner_type"
  fi
}

TRAIN_TYPE_FLAG="$(detect_train_type_flag "${SWIFT_BIN}")"

#######################################
# Splits one grid item of the form "<lora_rank>:<learning_rate>" and
# validates it before the rank is used in arithmetic.
# The rank must be a positive decimal integer without a leading zero:
# $(( )) would read "016" as octal and reject "08" with a base error, and a
# non-numeric rank would be evaluated as a variable name.
# The learning rate must be non-empty and must not contain a further ':'.
# Globals:   LORA_RANK, LEARNING_RATE (written)
# Arguments: $1 - grid item
# Returns:   0 if the item is valid, 1 (with a message on stderr) otherwise
#######################################
parse_grid_item() {
  local grid_item="$1"
  IFS=":" read -r LORA_RANK LEARNING_RATE <<< "${grid_item}"
  if [[ ! "${LORA_RANK}" =~ ^[1-9][0-9]*$ \
    || -z "${LEARNING_RATE}" \
    || "${LEARNING_RATE}" == *:* ]]; then
    echo "ERROR: invalid LORA_LR_GRID item: ${grid_item}" >&2
    return 1
  fi
}

# One sequential training run per grid item. Word-splitting of LORA_LR_GRID
# is intentional: items are separated by whitespace.
for item in ${LORA_LR_GRID}; do
  parse_grid_item "${item}" || exit 1

  # alpha is tied to the rank (alpha = 2 * rank).
  LORA_ALPHA="$((LORA_RANK * 2))"
  # "bs128" in the name is a fixed label; it is not recomputed when the
  # batch-related variables are overridden.
  RUN_NAME="qwen-3.5-9b-len${MAX_LENGTH}-bs128-lora-r${LORA_RANK}-lr${LEARNING_RATE}-drop${LORA_DROPOUT}-${RUN_TAG}"
  OUTPUT_DIR="${OUTPUT_ROOT}/${RUN_NAME}"
  mkdir -p "${OUTPUT_DIR}"

  echo "Starting SFT run: ${RUN_NAME}"
  # Both --num_train_epochs and --max_steps are passed; the run name doubles
  # as the SwanLab experiment name and the model name.
  "${SWIFT_BIN}" sft \
    --model "${MODEL_PATH}" \
    "${TRAIN_TYPE_FLAG}" lora \
    --dataset "${TRAIN_DATASET}" \
    --val_dataset "${VAL_DATASET}" \
    --torch_dtype bfloat16 \
    --num_train_epochs "${NUM_TRAIN_EPOCHS}" \
    --max_steps "${MAX_STEPS}" \
    --per_device_train_batch_size "${PER_DEVICE_TRAIN_BATCH_SIZE}" \
    --per_device_eval_batch_size "${PER_DEVICE_EVAL_BATCH_SIZE}" \
    --gradient_accumulation_steps "${GRADIENT_ACCUMULATION_STEPS}" \
    --learning_rate "${LEARNING_RATE}" \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.03 \
    --max_length "${MAX_LENGTH}" \
    --truncation_strategy delete \
    --gradient_checkpointing true \
    --packing false \
    --lora_rank "${LORA_RANK}" \
    --lora_alpha "${LORA_ALPHA}" \
    --lora_dropout "${LORA_DROPOUT}" \
    --target_modules all-linear \
    --eval_steps "${EVAL_STEPS}" \
    --save_steps "${SAVE_STEPS}" \
    --save_total_limit "${SAVE_TOTAL_LIMIT}" \
    --logging_steps 5 \
    --dataloader_num_workers 4 \
    --output_dir "${OUTPUT_DIR}" \
    --report_to swanlab \
    --swanlab_project "${SWANLAB_PROJECT}" \
    --swanlab_exp_name "${RUN_NAME}" \
    --model_name "${RUN_NAME}" \
    --model_author dong
done

脚本目标

这个脚本用于在 Qwen-3.5-9B 上进行 ms-swift LoRA SFT 训练。当前实验固定两个核心条件:

| 配置项 | 当前值 |
| --- | --- |
| max_length | 1024 |
| effective batch size | 128 |
| LoRA dropout | 0.1 |
| LoRA rank | 16 |
| learning rate | 3e-5 / 5e-5 |

它的定位不是单次训练命令,而是一个可复用的实验启动器:负责数据转换、环境检查、参数网格遍历、输出目录隔离和 SwanLab 上报。

当前会跑的实验

脚本里最关键的网格参数是:

LORA_LR_GRID="${LORA_LR_GRID:-16:3e-5 16:5e-5}"

每个 item 的格式是:

LoRA rank:learning rate

因此默认会顺序执行两组实验:

| 实验 | LoRA Rank | Learning Rate | Dropout | Max Length | Effective Batch |
| --- | --- | --- | --- | --- | --- |
| Run 1 | 16 | 3e-5 | 0.1 | 1024 | 128 |
| Run 2 | 16 | 5e-5 | 0.1 | 1024 | 128 |

第一组训练完成后,脚本会继续启动第二组。两组实验会进入不同的输出目录,并分别上报 SwanLab,后续可以直接对比训练曲线和验证集表现。

Batch Size 的计算方式

默认情况下脚本面向 8 卡训练:

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NPROC_PER_NODE=8
PER_DEVICE_TRAIN_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=8

有效 batch size 的计算方式是:

8 GPUs * 2 samples/GPU * 8 gradient accumulation steps = 128 samples / optimizer step

如果只使用 4 张 GPU,也可以通过环境变量覆盖参数,同时保持有效 batch size 不变:

CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
PER_DEVICE_TRAIN_BATCH_SIZE=2 \
GRADIENT_ACCUMULATION_STEPS=16 \
bash run_ms_swift_lora_sft_len1024_bs128.sh

数据准备

脚本使用的原始数据是 ShareGPT 格式:

RAW_TRAIN_DATASET="${WORK_DIR}/sft_train_sharegpt.json"
RAW_VAL_DATASET="${WORK_DIR}/sft_val_sharegpt.json"

训练前会先转换成 ms-swift 使用的 messages jsonl:

TRAIN_DATASET="${DATA_DIR}/sft_train_messages.jsonl"
VAL_DATASET="${DATA_DIR}/sft_val_messages.jsonl"

转换命令是:

"${PYTHON_BIN}" "${WORK_DIR}/prepare_ms_swift_dataset.py" --input "${RAW_TRAIN_DATASET}" --output "${TRAIN_DATASET}"
"${PYTHON_BIN}" "${WORK_DIR}/prepare_ms_swift_dataset.py" --input "${RAW_VAL_DATASET}" --output "${VAL_DATASET}"

这里单独做数据转换是有必要的。它把原始数据格式处理和训练过程解耦,后续如果训练出错,也更容易判断问题出在数据侧还是训练侧。

训练参数

核心训练命令使用 swift sft,训练类型是 LoRA:

--train_type lora
--target_modules all-linear
--lora_rank "${LORA_RANK}"
--lora_alpha "${LORA_ALPHA}"
--lora_dropout "${LORA_DROPOUT}"

其中 lora_alpha 会按 rank 自动计算:

LORA_ALPHA="$((LORA_RANK * 2))"

当前 rank 为 16,所以 alpha 为 32。

主要训练参数如下:

| 参数 | 配置 |
| --- | --- |
| num_train_epochs | 3 |
| max_steps | 1000 |
| max_length | 1024 |
| per_device_train_batch_size | 2 |
| per_device_eval_batch_size | 2 |
| gradient_accumulation_steps | 8 |
| lr_scheduler_type | cosine |
| warmup_ratio | 0.03 |
| torch_dtype | bfloat16 |
| gradient_checkpointing | true |
| packing | false |
| eval_steps | 100 |
| save_steps | 100 |
| save_total_limit | 5 |
| logging_steps | 5 |

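其中 max_steps=1000 的优先级高于 num_train_epochs:按照 Hugging Face Trainer 的语义,max_steps 为正数时训练总步数就是 1000,即使设置了 num_train_epochs=3,也不再按 epoch 数决定训练长度(数据不足 1000 step 时会重复遍历数据集,超过时则在 1000 step 处停止)。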

后台运行

带时间戳日志:

cd /root/paddlejob/workspace/env_run/output/dong/sft
nohup bash run_ms_swift_lora_sft_len1024_bs128.sh > train_len1024_bs128_$(date +%Y%m%d_%H%M%S).log 2>&1 &

固定日志文件名:

cd /root/paddlejob/workspace/env_run/output/dong/sft
nohup bash run_ms_swift_lora_sft_len1024_bs128.sh > train_len1024_bs128.log 2>&1 &

查看日志:

tail -f /root/paddlejob/workspace/env_run/output/dong/sft/train_len1024_bs128.log

查看进程:

ps -ef | grep run_ms_swift_lora_sft_len1024_bs128 | grep -v grep

停止训练(注意:下面的命令只会结束启动脚本本身,已经拉起的 swift / torchrun 训练进程可能仍在运行,需要用 ps 确认后另行结束):

pkill -f run_ms_swift_lora_sft_len1024_bs128.sh

小结

这个脚本的核心价值是把一次 LoRA SFT 实验标准化。它固定 max_length=1024 和有效 batch size 128,然后对 3e-5 和 5e-5 两个学习率做顺序对比。

从工程上看,它覆盖了训练中容易出问题的几个环节:数据转换、环境检查、版本兼容、输出目录隔离和 SwanLab 记录。后续如果继续扩展,可以把 LORA_LR_GRID 扩成更多 rank / learning rate 组合,也可以进一步把 max_length、dropout 和自动评测逻辑纳入实验流程。

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
COCO TensorRT 多进程 AI Claude 报税 v2ray 论文 Python YOLO FP16 Miniforge Rebuttal NLP MD5 Qwen2 XGBoost Gemma PDB Proxy Git FP8 WAN ResNet-50 DeepStream 域名 TTS Distillation Docker Template Domain LLM Heatmap 音频 飞书 论文速读 git-lfs tar CUDA Random Permission 净利润 Paddle SAM 搞笑 LLAMA 签证 FastAPI TensorFlow 关于博主 CLAP GoogLeNet 算法题 继承 PIP RGB CAM Web SQLite Agent tqdm uWSGI Tiktoken NLTK LaTeX Bert Quantize Video v0.dev 第一性原理 BTC Linux Animate Card 版权 Nginx Bipartite Diagram DeepSeek Hilton EXCEL SPIE Excel CV LoRA Conda Vmess CC 图形思考法 UNIX logger VGG-16 Bin Land Dataset 多线程 Review RAR Food LeetCode Jupyter Image2Text Hotel Markdown uwsgi Hungarian XML Shortcut QWEN OCR Jetson Clash Pandas Windows OpenAI Statistics Logo Translation Knowledge Search 顶会 Website Django Crawler Qwen2.5 Streamlit TSV 财报 Sklearn Quantization 阿里云 HuggingFace Color Firewall Plate Github Pytorch Password IndexTTS2 InvalidArgumentError GPT4 WebCrawler 递归学习法 ms-swift PyTorch llama.cpp diffusers Interview 云服务器 GPTQ Safetensors Google Ubuntu printf Plotly Pickle ChatGPT Ptyhon FP64 HaggingFace Augmentation Vim 强化学习 Algorithm Zip JSON Cloudreve Mixtral hf icon UI GGML transformers Anaconda CSV Disk scipy C++ Pillow Input Bitcoin 证件照 ONNX FP32 OpenCV Math PyCharm ModelScope Qwen Baidu NameSilo 图标 FlashAttention Paper Magnet 公式 News mmap SQL Michelin Transformers Data Attention git torchinfo VSCode Use PDF Tensor CEIR CTC API Llama BF16 Freesound Base64 Numpy VPN Tracking Datetime SVR GIT BeautifulSoup Breakpoint 腾讯云
站点统计

本站现有博文329篇,共被浏览858602

本站已经建立2567天!

热门文章
文章归档
回到顶部