Products
GG网络技术分享 2025-11-13 21:15 3
python
import dedupe import re from dedupe.comparator import Comparator from dedupe.tokenizer import WhitespaceTokenizer from dedupe.field import Field from dedupe.recordpair import recordpairs from dedupe.utils import gettrainingdata

class GazetteerComparator:
    """Compare two field values by tokenizing them and asking a gazetteer.

    The comparator does no scoring of its own. It tokenizes each value with
    the supplied tokenizer and passes both token sequences to
    ``gazetteer.compare``.

    NOTE(review): the scraped original lost its parameter lists and call
    arguments. The signatures below are reconstructed from the names the
    body references -- confirm against the original article.
    """

    def __init__(self, gazetteer, tokenizer):
        """Store the collaborators used by :meth:`compare`.

        Args:
            gazetteer: Object exposing ``compare(tokens_a, tokens_b)``.
            tokenizer: Object exposing ``tokenize(text)``.
        """
        self.gazetteer = gazetteer
        self.tokenizer = tokenizer

    def compare(self, field_1, field_2):
        """Return the gazetteer's comparison of the two tokenized values.

        Args:
            field_1: First raw field value.
            field_2: Second raw field value.

        Returns:
            Whatever ``self.gazetteer.compare`` returns for the token pairs.
        """
        # Compare the fields using the custom gazetteer and tokenizer.
        value1 = self.tokenizer.tokenize(field_1)
        value2 = self.tokenizer.tokenize(field_2)
        return self.gazetteer.compare(value1, value2)
class WhitespaceTokenizer:
    """Tokenizer that splits text on runs of whitespace."""

    def tokenize(self, text):
        """Return the whitespace-separated tokens of *text* as a list.

        Leading and trailing whitespace is ignored, and consecutive
        whitespace characters count as a single separator, so an empty or
        all-whitespace string yields an empty list.
        """
        # The scraped original returned the unbound method ``text.split``;
        # it has to be called to produce the tokens.
        return text.split()
# Setup script: builds a dedupe.Dedupe object and registers a field that uses
# GazetteerComparator and WhitespaceTokenizer.
# NOTE(review): this copy lost its call parentheses, arguments and line breaks,
# so the statements below do not parse as Python. Restore them from the
# original article before running.
data = # load the data from a database or another source fields = # define the fields to compare
deduper = dedupe.Dedupe
deduper.sample
deduper.field, has_missing=True, comparator=GazetteerComparator, tokenizer=WhitespaceTokenizer))
fields = # define the fields deduper = dedupe.Dedupe deduper.sample deduper.prepare_training deduper.train
# Generator that loops over records in a nested fashion and yields (r1, r2)
# when two re.match conditions hold.
# NOTE(review): the parameter list, the enumerate(...) argument and both
# re.match patterns/subjects are missing in this copy -- TODO restore them
# from the original article; the line does not parse as written.
def record_pairs: for i, r1 in enumerate: for r2 in records: if re.match and re.match: yield r1, r2
# Tuning and training steps on the deduper object: sets num_cores to 4 and
# sample_size to 10000, then references train, write_training and cleanup.
# NOTE(review): the method references below have lost their call parentheses
# and arguments, and the deduper.field lines are truncated fragments showing
# only trailing keyword options (limit, weight, keep_original, fuzzy_match).
# Presumably each was a separate field-configuration example -- confirm
# against the original article and the dedupe documentation.
deduper.num_cores = 4
deduper.sample_size = 10000
deduper.train
deduper.write_training
deduper.cleanup
deduper.field, limit=1000)
deduper.field, weight=1.5)
deduper.field, keep_original=True)
deduper.field, fuzzy_match=True)
Demand feedback