Annotation (Text & Multimodal)
Full Text QA Pipeline
Pipeline: chunking → generate a domain tree (interactive/customizable) → generate questions → match labels → generate answers → save as JSONL.
Quick Example
from datamax import DataMax

dm = DataMax(file_path="a.pdf", to_markdown=True, use_mineru=True)
qa = dm.get_pre_label(
    api_key="${DASHSCOPE_API_KEY}",
    base_url="${DASHSCOPE_BASE_URL}",  # /chat/completions is appended automatically if missing
    model_name="qwen-max",
    question_number=8,
    max_qps=5.0,
    use_tree_label=True,      # use the domain tree
    interactive_tree=False,   # interactive revision (optional)
)
dm.save_label_data(qa, "train")  # writes train.jsonl
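The output is standard JSONL, one JSON object per line. A minimal sketch of loading it back for inspection (the exact field names depend on your generation settings and are not guaranteed here):

import json

with open("train.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f]
print(f"Loaded {len(records)} QA records")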
Custom Domain Tree
custom_tree = [
    {"label": "1 Overview", "child": [{"label": "1.1 Background"}, {"label": "1.2 Terminology"}]},
    {"label": "2 Methods"},
]
qa = dm.get_pre_label(
    api_key="...", base_url="...", model_name="...",
    use_tree_label=True, interactive_tree=False,
    custom_domain_tree=custom_tree,
)
Plain-Text Input (No File Required)
from datamax.generator import full_qa_labeling_process

result = full_qa_labeling_process(
    content="Your long text...",
    api_key="...", base_url="...", model_name="qwen-max",
    chunk_size=500, chunk_overlap=100,
    question_number=6, max_qps=5.0,
    use_tree_label=True, debug=False,
)
qa_pairs = result.get("qa_pairs", [])
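Because full_qa_labeling_process returns the pairs in memory rather than writing a file, you can persist them yourself. A minimal sketch, assuming each pair is a JSON-serializable dict (the exact keys depend on the generator):

import json

with open("qa_pairs.jsonl", "w", encoding="utf-8") as f:
    for pair in qa_pairs:
        f.write(json.dumps(pair, ensure_ascii=False) + "\n")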
Multimodal QA (Markdown + Images)
When your Markdown contains images (![]()), you can generate image-text conversational QA.
from datamax.generator import generate_multimodal_qa_pairs

qa = generate_multimodal_qa_pairs(
    file_path="with_images.md",
    api_key="${OPENAI_API_KEY}",
    model_name="gpt-4o",
    question_number=2,
    max_qps=5.0,
)
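If you need a quick test input, a small Markdown file with an image reference will do; the file name and image path below are illustrative only, not part of the datamax API:

from pathlib import Path

Path("with_images.md").write_text(
    "# Sample Doc\n\n"
    "Some context around the figure.\n\n"
    "![architecture diagram](images/arch.png)\n",
    encoding="utf-8",
)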
API Notes (Key Points)
- Compatible with the OpenAI /chat/completions API; if base_url does not include that path, it is appended automatically.
- question_number controls how many questions are generated per chunk; concurrency is capped via max_qps (maximum requests per second).
- save_label_data writes *.jsonl files that can be used directly for training.
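For illustration, here is a sketch of the normalization described in the first point; datamax performs this internally, and normalize_base_url below is a hypothetical helper, not part of the library API:

def normalize_base_url(base_url: str) -> str:
    # Hypothetical helper mirroring the documented behavior:
    # append /chat/completions when the path is missing.
    base_url = base_url.rstrip("/")
    if not base_url.endswith("/chat/completions"):
        base_url += "/chat/completions"
    return base_url

print(normalize_base_url("https://api.example.com/v1"))
# -> https://api.example.com/v1/chat/completions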
Example Scripts
Text QA
""" Generate QA pairs from text with domain tree labeling. Requires DASHSCOPE_API_KEY/DASHSCOPE_BASE_URL or provide explicitly. Set QA_INPUT_SOURCE=obs with OBS_* credentials to pull inputs from Huawei OBS. """
import os from pathlib import Path
from datamax import DataMax from datamax.loader.core import DataLoader
api_key = os.getenv("DASHSCOPE_API_KEY", "YOUR OWN KEY") base_url = os.getenv("DASHSCOPE_BASE_URL", "YOUR BASE URL") model = os.getenv("QA_MODEL", "YOUR QA MODEL") qa_input_source = os.getenv("QA_INPUT_SOURCE", "local").lower() obs_endpoint = os.getenv("OBS_ENDPOINT") obs_access_key = os.getenv("OBS_ACCESS_KEY_ID") obs_secret_key = os.getenv("OBS_ACCESS_KEY_SECRET") obs_bucket_name = os.getenv("OBS_BUCKET_NAME") obs_download_dir_env = os.getenv("OBS_DOWNLOAD_DIR") obs_prefix = os.getenv("OBS_PREFIX", "")
root_dir = Path(os.getenv("DATAMAX_ROOT", "/mnt/f/datamax")) if not root_dir.is_absolute(): root_dir = Path(file).resolve().parents[2] / root_dir
train_dir_name = "train" local_dataset_dir = root_dir / "data" / "Step1" default_obs_download_dir = root_dir / "obs_downloads"
save_parent_path = root_dir / train_dir_name
def discover_local_files() -> list[Path]: if not local_dataset_dir.exists(): return [] return sorted(path for path in local_dataset_dir.rglob("*") if path.is_file())
def download_files_from_obs() -> list[Path]: missing = [ name for name, value in { "OBS_ENDPOINT": obs_endpoint, "OBS_ACCESS_KEY_ID": obs_access_key, "OBS_ACCESS_KEY_SECRET": obs_secret_key, "OBS_BUCKET_NAME": obs_bucket_name, }.items() if not value ] if missing: raise SystemExit( f"Missing OBS configuration for generate_qa: {', '.join(missing)}" )
if obs_download_dir_env:
download_dir = Path(obs_download_dir_env)
if not download_dir.is_absolute():
download_dir = root_dir / download_dir
else:
download_dir = default_obs_download_dir
loader = DataLoader(
endpoint=obs_endpoint,
secret_key=obs_secret_key,
access_key=obs_access_key,
bucket_name=obs_bucket_name,
source="obs",
)
loader.download_path = str(download_dir)
files = loader.load_from_obs_source(obs_prefix)
resolved_files = []
for file_path in files:
resolved_files.append(Path(file_path).resolve())
return sorted(resolved_files)
def resolve_input_files() -> list[Path]: if qa_input_source == "obs": return download_files_from_obs() if qa_input_source in {"", "local"}: return discover_local_files() raise SystemExit(f"Unsupported QA_INPUT_SOURCE value: {qa_input_source}")
def main() -> None: save_parent_path.mkdir(parents=True, exist_ok=True)
input_files = resolve_input_files()
if not input_files:
raise SystemExit("No input files found for QA generation.")
for input_file in input_files:
input_file = input_file.resolve()
try:
relative_path = input_file.relative_to(root_dir)
except ValueError:
relative_path = Path(input_file.name)
relative_stem = relative_path.with_suffix("")
save_dir = save_parent_path / relative_stem.parent
save_dir.mkdir(parents=True, exist_ok=True)
save_path = save_dir / f"{relative_stem.name}_train"
dm = DataMax(file_path=str(input_file), to_markdown=True)
data = dm.get_data()
content = data.get("content")
qa = dm.get_pre_label(
content=content,
api_key=api_key,
base_url=base_url,
model_name=model,
question_number=50, # question_number_per_chunk
max_qps=100.0,
debug=False,
structured_data=True, # enable structured output
auto_self_review_mode=True,
review_max_qps=100.0,
)
dm.save_label_data(qa, str(save_path))
break
if name == "main": main()
nohup python examples/scripts/generate_qa.py > generate_qa.out 2>&1 & echo $! > generate_qa.pid
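This runs the script in the background, logs stdout and stderr to generate_qa.out, and records the process ID so the job can later be stopped with kill $(cat generate_qa.pid).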
Multimodal QA
""" Generate multimodal QA pairs from a Markdown file with images. Requires OPENAI_API_KEY or provide explicitly. """ import os from datamax.generator import generate_multimodal_qa_pairs
def main(): md_path = "examples/generate/example.md" # Ensure this MD contains image links api_key = os.getenv("OPENAI_API_KEY", "your-api-key") model = os.getenv("OPENAI_VISION_MODEL", "gpt-4o")
qa = generate_multimodal_qa_pairs(
file_path=md_path,
api_key=api_key,
model_name=model,
question_number=2,
max_qps=5.0,
)
print(f"Generated {len(qa)} multimodal QA pairs")
if name == "main": main()