OpenClaw 精简版 文本信息提取工具

openclaw OpenClaw手册 2

OpenClaw 是一个轻量级的文本信息提取框架,专注于从非结构化文本中提取结构化信息。以下给出其简化版本的设计与实现。

OpenClaw 精简版 文本信息提取工具-第1张图片-OpenClaw 开源免费 -中文免费安装

核心架构

class OpenClaw:
    """Small extraction pipeline: preprocess the text, then run named extractors.

    Extractors are callables stored by name; preprocessors are callables
    applied to the text in the order they were registered.
    """

    def __init__(self):
        self.extractors = {}
        self.preprocessors = []

    def add_extractor(self, name, extractor):
        """Register ``extractor`` under ``name`` (an existing name is replaced)."""
        self.extractors[name] = extractor

    def add_preprocessor(self, preprocessor):
        """Append ``preprocessor`` to the end of the preprocessing chain."""
        self.preprocessors.append(preprocessor)

    def extract(self, text, extractor_names=None):
        """Run the preprocessing chain, then the selected extractors.

        Args:
            text: Raw input handed to the first preprocessor.
            extractor_names: Names of the extractors to run. A falsy value
                (``None`` or an empty collection) selects every registered
                extractor; names that are not registered are skipped.

        Returns:
            A dict mapping each executed extractor name to its return value.
        """
        # Each preprocessor receives the output of the previous one.
        cleaned = text
        for step in self.preprocessors:
            cleaned = step(cleaned)

        selected = extractor_names or self.extractors.keys()
        return {
            label: self.extractors[label](cleaned)
            for label in selected
            if label in self.extractors
        }

内置提取器

正则表达式提取器

import re
from typing import List, Dict
class RegexExtractor:
    """Callable extractor that applies one regular expression per field."""

    def __init__(self, patterns: Dict[str, str]):
        """Store the pattern table.

        Args:
            patterns: Mapping of ``{field name: regular expression}``.
        """
        self.patterns = patterns

    def __call__(self, text: str) -> Dict[str, List[str]]:
        """Return ``{field name: re.findall(pattern, text)}`` for every field."""
        return {
            name: re.findall(expression, text)
            for name, expression in self.patterns.items()
        }

关键词上下文提取器

class KeywordContextExtractor:
    """Callable extractor returning the text surrounding each keyword hit."""

    def __init__(self, keywords: List[str], window: int = 50):
        """Store the keyword list and the context size.

        Args:
            keywords: Keywords to search for (plain substring match).
            window: Number of characters kept on each side of a hit.
        """
        self.keywords = keywords
        self.window = window

    def __call__(self, text: str) -> Dict[str, List[Dict[str, str]]]:
        """Collect a context snippet for every occurrence of every keyword.

        Args:
            text: Text to search.

        Returns:
            ``{"contexts": [{"keyword": ..., "context": ...}, ...]}``, grouped
            by keyword in the order of ``self.keywords`` and, within one
            keyword, in order of appearance. Each snippet is clipped to the
            bounds of ``text``.
        """
        contexts = []
        for keyword in self.keywords:
            if not keyword:
                # An empty keyword matches at every position and carries no
                # information, so it is skipped.
                continue
            position = text.find(keyword)
            while position != -1:
                start = max(0, position - self.window)
                end = min(len(text), position + len(keyword) + self.window)
                contexts.append({
                    "keyword": keyword,
                    "context": text[start:end],
                })
                # Resume after this hit so later, non-overlapping occurrences
                # are reported as well.
                position = text.find(keyword, position + len(keyword))
        return {"contexts": contexts}

示例使用

# Create the OpenClaw instance
claw = OpenClaw()
# Add a preprocessing function
def clean_text(text):
    """Collapse whitespace and drop symbols that carry no information.

    Word characters, whitespace and basic punctuation are kept. The symbols
    ``@ % + / $ ¥`` are kept as well because the e-mail, percentage, date and
    money patterns used in this file need them, and common full-width CJK
    punctuation is kept so Chinese sentences stay intact.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    import re
    # Collapse every run of whitespace (including newlines) into one space.
    text = re.sub(r'\s+', ' ', text)
    # Remove everything that is not on the whitelist below.
    text = re.sub(r'[^\w\s.,!?\-:@%+/$¥,。:;!?、]', '', text)
    return text.strip()
claw.add_preprocessor(clean_text)
# Add extractors for e-mail addresses and phone numbers.
# The TLD class is [A-Za-z]; a '|' inside a character class is a literal
# character, not an alternation.
# NOTE: the e-mail pattern can only match if the preprocessor leaves '@' in
# the text.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
# Accept both 3-3-4 and 3-4-4 groupings (e.g. 138-1234-5678). Digit
# lookarounds are used instead of \b so a number directly adjacent to CJK
# characters (which count as word characters) is still found.
phone_pattern = r'(?<!\d)\d{3}[-.]?\d{3,4}[-.]?\d{4}(?!\d)'
email_extractor = RegexExtractor({"emails": email_pattern})
phone_extractor = RegexExtractor({"phones": phone_pattern})
claw.add_extractor("emails", email_extractor)
claw.add_extractor("phones", phone_extractor)
# Add the keyword-context extractor
keyword_extractor = KeywordContextExtractor(["价格", "费用", "成本"], window=30)
claw.add_extractor("price_context", keyword_extractor)
# Usage example
sample_text = """
联系人:张三,电话:138-1234-5678,邮箱:zhangsan@example.com
产品价格:2999元,安装费用:500元,总成本约3500元。
"""
result = claw.extract(sample_text)
print(result)

输出结果(示意,实际输出取决于预处理规则、正则模式与上下文窗口大小)

{
    "emails": {"emails": ["zhangsan@example.com"]},
    "phones": {"phones": ["138-1234-5678"]},
    "price_context": {
        "contexts": [
            {"keyword": "价格", "context": "产品价格:2999元,安装费用:500"},
            {"keyword": "费用", "context": "价格:2999元,安装费用:500元,总成本"},
            {"keyword": "成本", "context": "费用:500元,总成本约3500元。"}
        ]
    }
}

规则引擎扩展

class RuleEngine:
    """Ordered list of ``(condition, action)`` rules applied to a text."""

    def __init__(self):
        self.rules = []

    def add_rule(self, condition_func, action_func):
        """Register a rule made of a condition function and an action function."""
        rule = (condition_func, action_func)
        self.rules.append(rule)

    def apply(self, text, context=None):
        """Apply all rules in registration order.

        Every condition and action is called as ``f(text, context)``.

        Returns:
            A list with the action result of each rule whose condition
            returned a truthy value.
        """
        return [
            run(text, context)
            for matches, run in self.rules
            if matches(text, context)
        ]
# Example rules
rule_engine = RuleEngine()
# Rule 1: if the text contains both "发票" and "金额", extract the amount
def has_invoice_amount(text, context):
    """Return True when ``text`` contains both "发票" and "金额".

    ``context`` is accepted to match the rule signature and is not used.
    """
    required_terms = ("发票", "金额")
    return all(term in text for term in required_terms)
def extract_invoice_amount(text, context):
    """Extract every number that follows the word "金额".

    The number may be separated from "金额" by an optional colon (ASCII ``:``
    or full-width ``:``) and optional whitespace, and may have a decimal part.
    ``context`` is accepted to match the rule signature and is not used.

    Returns:
        ``{"type": "invoice_amount", "values": [...]}`` where ``values`` holds
        the matched numbers as strings, in order of appearance.
    """
    import re
    # The class lists both colon forms; Chinese text commonly uses the
    # full-width one.
    amounts = re.findall(r'金额[::]?\s*(\d+(?:\.\d+)?)', text)
    return {"type": "invoice_amount", "values": amounts}
# Register rule 1: the action runs only when the condition returns a truthy value.
rule_engine.add_rule(has_invoice_amount, extract_invoice_amount)

简单CRF模型接口(可选)

class SimpleCRFExtractor:
    """Simplified CRF-based entity extractor (requires sklearn-crfsuite).

    The model is any object exposing ``predict(list_of_feature_sequences)``
    that returns one label sequence per input sequence. Labels are read as
    BIO tags (``B-X``, ``I-X``, ``O``). Instances are callable, so they can be
    registered with ``OpenClaw.add_extractor``.
    """

    def __init__(self, model_path=None):
        """Optionally load a pickled model from ``model_path``."""
        self.model = None
        if model_path:
            self.load_model(model_path)

    def load_model(self, model_path):
        """Load a pre-trained model from a pickle file.

        SECURITY: ``pickle.load`` can execute arbitrary code contained in the
        file. Only load model files that come from a trusted source.
        """
        import pickle
        with open(model_path, 'rb') as f:
            self.model = pickle.load(f)

    def __call__(self, text):
        """Alias for :meth:`extract` so the instance works as a plain callable."""
        return self.extract(text)

    def extract(self, text):
        """Predict labels for ``text`` and return the recognised entities.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare with None explicitly: a model object may define its own
        # truthiness (for example via __len__).
        if self.model is None:
            raise ValueError("模型未加载")
        features = self.text_to_features(text)
        # predict() takes a batch; a one-element batch is sent and its single
        # label sequence is taken.
        labels = self.model.predict([features])[0]
        return self.labels_to_entities(text, labels)

    def text_to_features(self, text):
        """Build one feature dict per whitespace-separated token.

        This is a simplified feature set; real applications need richer
        feature engineering.
        """
        words = text.split()
        last = len(words) - 1
        features = []
        for i, word in enumerate(words):
            features.append({
                'word': word,
                'word_lower': word.lower(),
                'is_digit': word.isdigit(),
                # Slicing already returns the whole word when it is shorter
                # than three characters.
                'prefix': word[:3],
                'suffix': word[-3:],
                'prev_word': words[i - 1] if i > 0 else '<START>',
                'next_word': words[i + 1] if i < last else '<END>',
            })
        return features

    def labels_to_entities(self, text, labels):
        """Convert a BIO label sequence into a list of entities.

        Args:
            text: The text that was labelled; tokens are obtained with
                ``text.split()``.
            labels: One label per token. Extra tokens or labels are ignored.

        Returns:
            A list of dicts with ``text`` (tokens joined by spaces), ``type``
            (the label without its ``B-`` prefix) and ``start`` / ``end``
            (inclusive token indices).
        """
        entities = []
        current_entity = None
        words = text.split()
        for i, (word, label) in enumerate(zip(words, labels)):
            if label.startswith('B-'):
                # A new entity starts; close the one in progress, if any.
                if current_entity:
                    entities.append(current_entity)
                current_entity = {
                    'text': word,
                    'type': label[2:],
                    'start': i,
                    'end': i,
                }
            elif label.startswith('I-') and current_entity:
                current_entity['text'] += ' ' + word
                current_entity['end'] = i
            elif label == 'O' and current_entity:
                entities.append(current_entity)
                current_entity = None
        # An entity that runs up to the last token has no closing 'O' label,
        # so it is flushed here.
        if current_entity:
            entities.append(current_entity)
        return entities

使用示例

# Complete example
def main():
    """Build a pipeline, run it on a sample sentence and print the results.

    NOTE(review): ``clean_text`` runs before the patterns, so any symbol it
    removes never reaches them.
    """
    # Regular expressions for the fields to pull out of the text.
    field_patterns = {
        "date": r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',
        "money": r'¥\s*\d+(?:\.\d+)?|\$\s*\d+(?:\.\d+)?',
        "percentage": r'\d+(?:\.\d+)?%'
    }

    # Assemble the pipeline: one preprocessor, one regex extractor.
    pipeline = OpenClaw()
    pipeline.add_preprocessor(clean_text)
    pipeline.add_extractor("patterns", RegexExtractor(field_patterns))

    # Run the extraction on the sample sentence.
    sample = "会议日期:2024-01-15,预算:$5000,完成率:85.5%"
    extracted = pipeline.extract(sample)

    # Print one line per extractor.
    print("提取结果:")
    for extractor_name, found in extracted.items():
        print(f"{extractor_name}: {found}")


if __name__ == "__main__":
    main()

这个精简版 OpenClaw 提供了:

  1. 模块化设计:可轻松添加新的提取器和预处理器
  2. 多种提取方式:正则表达式、关键词上下文、规则引擎
  3. 可扩展性:支持集成更复杂的机器学习模型
  4. 轻量级:无需复杂依赖,核心功能简洁明了

可以根据具体需求进一步扩展功能,如添加:

  • 更多内置提取器
  • 结果后处理
  • 提取结果验证
  • 多语言支持
  • 批量处理功能

标签: OpenClaw 文本信息提取

抱歉,评论功能暂时关闭!