
Annotation (Text & Multimodal)

Text QA Full Workflow

Workflow: chunking → domain tree generation (interactive/customizable) → question generation → label matching → answer generation → save as JSONL.

Quick Example

from datamax import DataMax

dm = DataMax(file_path="a.pdf", to_markdown=True, use_mineru=True)
qa = dm.get_pre_label(
    api_key="${DASHSCOPE_API_KEY}",
    base_url="${DASHSCOPE_BASE_URL}",   # 未带 /chat/completions 会自动补全
    model_name="qwen-max",
    question_number=8,
    max_qps=5.0,
    use_tree_label=True,       # use a domain tree
    interactive_tree=False     # interactive revision (optional)
)
dm.save_label_data(qa, "train")  # 生成 train.jsonl

Custom Domain Tree

custom_tree = [
  {"label": "1 Overview", "child": [{"label": "1.1 Background"}, {"label": "1.2 Terminology"}]},
  {"label": "2 Methods"}
]

qa = dm.get_pre_label(
  api_key="...", base_url="...", model_name="...",
  use_tree_label=True, interactive_tree=False,
  custom_domain_tree=custom_tree
)
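For reference, the nested label/child structure is equivalent to a flat set of label paths. A minimal sketch that walks the tree (illustrative only; this is not DataMax's internal matcher) gives a sense of the label space that generated questions are matched against:

def flatten_tree(nodes, prefix=""):
    # Yield a slash-joined path for every node, depth-first.
    for node in nodes:
        path = f"{prefix}/{node['label']}" if prefix else node["label"]
        yield path
        yield from flatten_tree(node.get("child", []), path)

print(list(flatten_tree(custom_tree)))
# ['1 Overview', '1 Overview/1.1 Background', '1 Overview/1.2 Terminology', '2 Methods']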

Plain Text as Input (No File Needed)

from datamax.generator import full_qa_labeling_process

result = full_qa_labeling_process(
  content="你的长文本...",
  api_key="...", base_url="...", model_name="qwen-max",
  chunk_size=500, chunk_overlap=100,
  question_number=6, max_qps=5.0,
  use_tree_label=True, debug=False
)
qa_pairs = result.get("qa_pairs", [])
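full_qa_labeling_process hands the pairs back in memory rather than writing a file, so persisting them as JSONL takes a few lines of standard-library code (a sketch; the exact fields inside each pair depend on your model and settings):

import json

with open("train.jsonl", "w", encoding="utf-8") as f:
    for pair in qa_pairs:
        f.write(json.dumps(pair, ensure_ascii=False) + "\n")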

Multimodal QA (Markdown + Images)

When your Markdown contains images (![]()), you can generate image-and-text conversational QA pairs.

from datamax.generator import generate_multimodal_qa_pairs

qa = generate_multimodal_qa_pairs(
  file_path="with_images.md",
  api_key="${OPENAI_API_KEY}",
  model_name="gpt-4o",
  question_number=2,
  max_qps=5.0
)

API Notes (Highlights)

  • Compatible with the OpenAI /chat/completions API; if base_url does not include that path, it is appended automatically
  • question_number controls how many questions are generated per chunk; concurrency is capped via max_qps (maximum requests per second), as the sketch after this list illustrates
  • save_label_data writes *.jsonl output that can be fed directly into training
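For intuition, a requests-per-second cap like max_qps can be enforced with a minimum-interval throttle. This is only an illustrative, single-threaded sketch, not DataMax's internal scheduler:

import time

class QpsThrottle:
    """Ensure successive acquire() calls are at least 1/max_qps seconds apart."""

    def __init__(self, max_qps: float):
        self.min_interval = 1.0 / max_qps
        self.last = 0.0

    def acquire(self) -> None:
        # Sleep just long enough to respect the minimum interval.
        wait = self.last + self.min_interval - time.monotonic()
        if wait > 0:
            time.sleep(wait)
        self.last = time.monotonic()

throttle = QpsThrottle(max_qps=5.0)
# throttle.acquire()  # call before each chat-completion request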

Example Scripts

Text QA

""" Generate QA pairs from text with domain tree labeling. Requires DASHSCOPE_API_KEY/DASHSCOPE_BASE_URL or provide explicitly. Set QA_INPUT_SOURCE=obs with OBS_* credentials to pull inputs from Huawei OBS. """

import os from pathlib import Path

from datamax import DataMax from datamax.loader.core import DataLoader

api_key = os.getenv("DASHSCOPE_API_KEY", "YOUR OWN KEY") base_url = os.getenv("DASHSCOPE_BASE_URL", "YOUR BASE URL") model = os.getenv("QA_MODEL", "YOUR QA MODEL") qa_input_source = os.getenv("QA_INPUT_SOURCE", "local").lower() obs_endpoint = os.getenv("OBS_ENDPOINT") obs_access_key = os.getenv("OBS_ACCESS_KEY_ID") obs_secret_key = os.getenv("OBS_ACCESS_KEY_SECRET") obs_bucket_name = os.getenv("OBS_BUCKET_NAME") obs_download_dir_env = os.getenv("OBS_DOWNLOAD_DIR") obs_prefix = os.getenv("OBS_PREFIX", "")

root_dir = Path(os.getenv("DATAMAX_ROOT", "/mnt/f/datamax")) if not root_dir.is_absolute(): root_dir = Path(file).resolve().parents[2] / root_dir

train_dir_name = "train" local_dataset_dir = root_dir / "data" / "Step1" default_obs_download_dir = root_dir / "obs_downloads"

save_parent_path = root_dir / train_dir_name

def discover_local_files() -> list[Path]: if not local_dataset_dir.exists(): return [] return sorted(path for path in local_dataset_dir.rglob("*") if path.is_file())

def download_files_from_obs() -> list[Path]: missing = [ name for name, value in { "OBS_ENDPOINT": obs_endpoint, "OBS_ACCESS_KEY_ID": obs_access_key, "OBS_ACCESS_KEY_SECRET": obs_secret_key, "OBS_BUCKET_NAME": obs_bucket_name, }.items() if not value ] if missing: raise SystemExit( f"Missing OBS configuration for generate_qa: {', '.join(missing)}" )

    if obs_download_dir_env:
        download_dir = Path(obs_download_dir_env)
        if not download_dir.is_absolute():
            download_dir = root_dir / download_dir
    else:
        download_dir = default_obs_download_dir

    loader = DataLoader(
        endpoint=obs_endpoint,
        secret_key=obs_secret_key,
        access_key=obs_access_key,
        bucket_name=obs_bucket_name,
        source="obs",
    )
    loader.download_path = str(download_dir)
    files = loader.load_from_obs_source(obs_prefix)

    resolved_files = []
    for file_path in files:
        resolved_files.append(Path(file_path).resolve())
    return sorted(resolved_files)

def resolve_input_files() -> list[Path]:
    if qa_input_source == "obs":
        return download_files_from_obs()
    if qa_input_source in {"", "local"}:
        return discover_local_files()
    raise SystemExit(f"Unsupported QA_INPUT_SOURCE value: {qa_input_source}")

def main() -> None:
    save_parent_path.mkdir(parents=True, exist_ok=True)

    input_files = resolve_input_files()
    if not input_files:
        raise SystemExit("No input files found for QA generation.")

    for input_file in input_files:
        input_file = input_file.resolve()
        try:
            relative_path = input_file.relative_to(root_dir)
        except ValueError:
            relative_path = Path(input_file.name)

        # Mirror the input's relative layout under the train/ output directory.
        relative_stem = relative_path.with_suffix("")
        save_dir = save_parent_path / relative_stem.parent
        save_dir.mkdir(parents=True, exist_ok=True)
        save_path = save_dir / f"{relative_stem.name}_train"

        dm = DataMax(file_path=str(input_file), to_markdown=True)
        data = dm.get_data()

        content = data.get("content")

        qa = dm.get_pre_label(
            content=content,
            api_key=api_key,
            base_url=base_url,
            model_name=model,
            question_number=50,  # questions generated per chunk
            max_qps=100.0,
            debug=False,
            structured_data=True,  # enable structured output
            auto_self_review_mode=True,
            review_max_qps=100.0,
        )

        dm.save_label_data(qa, str(save_path))
        break  # process only the first file; remove this to handle every input


if __name__ == "__main__":
    main()

Run the script in the background and record its PID:

nohup python examples/scripts/generate_qa.py > generate_qa.out 2>&1 & echo $! > generate_qa.pid
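To stop it later, reuse the recorded PID:

kill $(cat generate_qa.pid)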

Multimodal QA

""" Generate multimodal QA pairs from a Markdown file with images. Requires OPENAI_API_KEY or provide explicitly. """ import os from datamax.generator import generate_multimodal_qa_pairs

def main():
    md_path = "examples/generate/example.md"  # ensure this MD contains image links
    api_key = os.getenv("OPENAI_API_KEY", "your-api-key")
    model = os.getenv("OPENAI_VISION_MODEL", "gpt-4o")

    qa = generate_multimodal_qa_pairs(
        file_path=md_path,
        api_key=api_key,
        model_name=model,
        question_number=2,
        max_qps=5.0,
    )
    print(f"Generated {len(qa)} multimodal QA pairs")

if name == "main": main()