API 参考(精选)
解析与清洗
from datamax import DataMax
dm = DataMax(
file_path: str | list,
domain: str = "Technology",
to_markdown: bool = False,
use_mineru: bool = False,
use_qwen_vl_ocr: bool = False,
use_mllm: bool = False,
mllm_system_prompt: str | None = None,
api_key: str | None = None,
base_url: str | None = None,
model_name: str | None = None,
)
dm.get_data() -> dict | list[dict]
dm.clean_data(methods: list[str], text: str | None = None) -> dict | str
标注与生成
dm.get_pre_label(
api_key: str,
base_url: str,
model_name: str,
question_number: int = 5,
max_qps: float = 5.0,
use_tree_label: bool = False,
interactive_tree: bool = False,
messages: list | None = None,
) -> list | dict
dm.save_label_data(label_data: list | dict, save_file_name: str = "qa_pairs") -> None
from datamax.generator import full_qa_labeling_process, generate_multimodal_qa_pairs
爬虫
from datamax.crawler import crawl
crawl(keyword_or_url: str, engine: str = "auto") -> dict  # engine 可选值: "auto" | "web" | "arxiv"
from datamax.parser import CrawlerParser
CrawlerParser(file_path: str).parse() -> MarkdownOutputVo
清洗(独立类)
from datamax.cleaner import AbnormalCleaner, TextFilter, PrivacyDesensitization
评估
from datamax.evaluator import TextQualityEvaluator, MultimodalConsistencyEvaluator
输出结构
{
"extension": "md",
"content": "...",
"lifecycle": [
{
"update_time": "2025-01-01 12:00:00",
"life_type": ["DATA_PROCESSING"],
"life_metadata": {"source_file": "...", "domain": "...", "usage_purpose": "..."}
}
]
}