引言
在信息时代,数据压缩是关键技术。文本压缩和语料压缩是两种不同的压缩思路:前者在保持内容不变的前提下减少存储空间,后者通过语义理解减少信息量。本文介绍它们的原理、经典算法和使用场景。
第一部分:文本压缩(Storage Compression)
1.1 什么是文本压缩?
文本压缩是在保持内容完全不变的前提下,通过算法减少存储空间的技术。压缩后的数据可以精确还原为原始文本。
核心特征:
- ✅ 无损压缩:内容100%保持不变
- ✅ 可精确还原:解压后字节级完全一致
- ✅ 存储优化:减少存储空间和传输带宽
- ✅ 通用性强:适用于任意文本数据
1.2 文本压缩的基本原理
1.2.1 信息熵理论
信息熵衡量信息的不确定性。重复模式越多,熵越低,压缩潜力越大。
高熵文本(难以压缩):
"a7b3c9d2e8f1g5h4i6j0" # 随机字符,重复少
低熵文本(易于压缩):
"aaaaaaaaaa" # 高度重复,压缩比高
1.2.2 压缩策略
- 消除冗余:识别并压缩重复模式
- 统计编码:高频字符用短码,低频字符用长码
- 字典压缩:用指针引用已出现的字符串
1.3 经典文本压缩算法
算法1:游程编码(Run-Length Encoding, RLE)
原理: 将连续重复的字符用“字符+次数”表示
适用场景: 大量连续重复字符的文本(如空格、制表符)
实现示例:
def rle_encode(text):
    """Run-length encode *text*.

    Each run of repeated characters becomes "<char><count>"; a run of
    length one is emitted as the bare character (no count).
    """
    if not text:
        return ""
    pieces = []
    run_char, run_len = text[0], 1
    for ch in text[1:]:
        if ch == run_char:
            run_len += 1
        else:
            # Flush the finished run, then start a new one.
            pieces.append(run_char if run_len == 1 else f"{run_char}{run_len}")
            run_char, run_len = ch, 1
    # Flush the final run.
    pieces.append(run_char if run_len == 1 else f"{run_char}{run_len}")
    return "".join(pieces)
def rle_decode(encoded):
    """Inverse of rle_encode: expand "<char><count>" groups back to runs.

    NOTE(review): the encoding is ambiguous if the original text itself
    contains digit characters — round-trips are only safe for digit-free
    input.
    """
    out = []
    pos = 0
    n = len(encoded)
    while pos < n:
        ch = encoded[pos]
        pos += 1
        # Collect the (optional) run-length digits following the character.
        digits = ""
        while pos < n and encoded[pos].isdigit():
            digits += encoded[pos]
            pos += 1
        out.append(ch * int(digits) if digits else ch)
    return "".join(out)
# Example: lossless round trip through RLE
original = "AAAAABBBCCCCCCDDD"
compressed = rle_encode(original)  # "A5B3C6D3"
decompressed = rle_decode(compressed)  # "AAAAABBBCCCCCCDDD"
assert original == decompressed  # ✅ identical to the original
压缩效果:
- 原始:17字节
- 压缩:8字节
- 压缩比:47%
算法2:霍夫曼编码(Huffman Coding)
原理: 根据字符出现频率构建最优前缀码树,高频字符用短码
适用场景: 字符频率分布不均匀的文本
实现示例:
import heapq
from collections import Counter
class HuffmanNode:
    """Node of a Huffman tree.

    Leaves carry a character in ``char``; internal nodes use ``char=None``
    and hold the summed frequency of their subtree.
    """

    def __init__(self, char, freq, left=None, right=None):
        self.char = char    # leaf symbol, or None for internal nodes
        self.freq = freq    # frequency weight of this subtree
        self.left = left
        self.right = right

    def __lt__(self, other):
        # Order nodes by frequency so they can live directly in a heapq heap.
        return self.freq < other.freq
def build_huffman_tree(text):
    """Build a Huffman tree for *text* and return its root HuffmanNode.

    Raises:
        ValueError: if *text* is empty (the original code crashed with an
            opaque IndexError on ``heap[0]``).
    """
    if not text:
        raise ValueError("cannot build a Huffman tree from empty text")
    counts = Counter(text)
    # One leaf per distinct character (avoid shadowing the Counter name).
    heap = [HuffmanNode(ch, n) for ch, n in counts.items()]
    heapq.heapify(heap)
    # Repeatedly merge the two lowest-frequency subtrees.
    while len(heap) > 1:
        left = heapq.heappop(heap)
        right = heapq.heappop(heap)
        heapq.heappush(heap, HuffmanNode(None, left.freq + right.freq, left, right))
    return heap[0]
def build_codes(node, code='', codes=None):
    """Walk the Huffman tree and return a {char: bitstring} code table.

    Bug fix: the original signature used a mutable default (``codes={}``),
    so codes from earlier calls leaked into every later call. A tree that
    is a single leaf gets the code '0'.

    Args:
        node: current HuffmanNode (or any object with char/left/right).
        code: bit prefix accumulated on the path from the root.
        codes: accumulator dict; created fresh when None.
    """
    if codes is None:
        codes = {}
    if node.char is not None:
        codes[node.char] = code if code else '0'
    else:
        build_codes(node.left, code + '0', codes)
        build_codes(node.right, code + '1', codes)
    return codes
def huffman_encode(text, codes):
    """Encode *text* by concatenating each character's Huffman bitstring."""
    bits = []
    for ch in text:
        bits.append(codes[ch])
    return ''.join(bits)
def huffman_decode(encoded, codes):
    """Decode a Huffman bitstring back to text using *codes* ({char: bits}).

    Relies on the prefix-free property of Huffman codes: the first time the
    accumulated bits match a code, that match is the decoded character.
    """
    lookup = {bits: ch for ch, bits in codes.items()}
    result = []
    buffer = ''
    for bit in encoded:
        buffer += bit
        ch = lookup.get(buffer)
        if ch is not None:
            result.append(ch)
            buffer = ''
    return ''.join(result)
# Example: build the tree, encode, decode, and print the code table
text = "this is an example of a huffman tree"
tree = build_huffman_tree(text)
codes = build_codes(tree)
encoded = huffman_encode(text, codes)
decoded = huffman_decode(encoded, codes)
assert text == decoded  # ✅ identical to the original
print("字符编码表:")
# Shortest codes first, i.e. the most frequent characters first.
for char, code in sorted(codes.items(), key=lambda x: len(x[1])):
    print(f"'{char}': {code}")
压缩效果:
- 原始文本:36字符 × 8位 = 288位
- 压缩后:约150位(取决于频率分布)
- 压缩比:约50%
算法3:LZ77压缩算法
原理: 用“距离+长度+下一个字符”的三元组表示重复字符串
适用场景: 通用文本压缩,是ZIP、GZIP的基础
实现示例:
def lz77_compress(data, window_size=4096, lookahead_buffer=18):
    """
    LZ77 compression: emit (distance, length, next_char) triples.

    Args:
        data: input string.
        window_size: how far back the search window reaches.
        lookahead_buffer: maximum match length considered.

    Returns:
        List of (distance, length, next_char) triples; next_char is None
        when a match runs to the end of the lookahead window.
    """
    triples = []
    pos = 0
    while pos < len(data):
        # Back-reference window and forward lookahead at this position.
        window = data[max(0, pos - window_size):pos]
        ahead = data[pos:min(pos + lookahead_buffer, len(data))]
        # Default: no match, emit a literal.
        best = (0, 0, data[pos])
        for start in range(len(window)):
            length = 0
            while (length < len(ahead)
                   and start + length < len(window)
                   and window[start + length] == ahead[length]):
                length += 1
            # Strictly greater, so the earliest (largest-distance) match of
            # maximal length wins — same tie-break as the original.
            if length > best[1]:
                follow = ahead[length] if length < len(ahead) else None
                best = (len(window) - start, length, follow)
        triples.append(best)
        pos += best[1] + 1
    return triples
def lz77_decompress(compressed):
    """Rebuild the original string from (distance, length, next_char) triples."""
    chars = []
    for distance, length, follow in compressed:
        if distance > 0 and length > 0:
            # Copy `length` characters starting `distance` back in the output.
            base = len(chars) - distance
            for offset in range(length):
                chars.append(chars[base + offset])
        if follow:
            chars.append(follow)
    return ''.join(chars)
# Example: lossless round trip through LZ77
text = "abracadabraabracadabra"
compressed = lz77_compress(text)
decompressed = lz77_decompress(compressed)
assert text == decompressed  # ✅ identical to the original
print(f"原始长度: {len(text)}")
print(f"压缩后元组数: {len(compressed)}")
print(f"压缩元组: {compressed[:5]}...")  # show only the first 5 triples
压缩效果:
- 原始:22字符
- 压缩:约10-12个三元组(取决于重复模式)
- 压缩比:约45-55%
算法4:LZ78压缩算法
原理: 动态构建字典,用字典索引+新字符表示字符串
适用场景: 适合重复模式不连续的情况
实现示例:
def lz78_compress(data):
    """LZ78 compression: emit (dict_index, next_char) pairs while growing a
    phrase dictionary on the fly.

    Returns:
        (compressed, dictionary): the pair list and the final phrase dict.

    NOTE(review): near the end of the input, data[i:i+j] slices are clamped,
    so match_length can exceed the remaining text; the final (index, '')
    pair still decodes correctly, but the logic is fragile — verify before
    reusing elsewhere.
    """
    dictionary = {}
    compressed = []
    i = 0
    while i < len(data):
        # Find the longest already-known prefix starting at position i.
        match_index = 0
        match_length = 0
        for j in range(1, i + 1):
            if data[i:i+j] in dictionary:
                match_index = dictionary[data[i:i+j]]
                match_length = j
            else:
                break
        # Register the matched prefix extended by one new character.
        if i + match_length < len(data):
            new_string = data[i:i+match_length+1]
            dictionary[new_string] = len(dictionary) + 1
            compressed.append((match_index, data[i+match_length]))
        else:
            # Input ends exactly at the match: emit an empty follow-up char.
            compressed.append((match_index, ''))
        i += match_length + 1
    return compressed, dictionary
def lz78_decompress(compressed):
    """Rebuild text from LZ78 (dict_index, next_char) pairs.

    Index 0 is the empty phrase; each decoded phrase is appended to the
    dictionary so later pairs can reference it.
    """
    phrases = {0: ''}
    parts = []
    for index, char in compressed:
        if index in phrases:
            phrase = phrases[index] + char
            parts.append(phrase)
            phrases[len(phrases)] = phrase
        else:
            # Defensive fallback for an unknown index (should not occur
            # with output produced by lz78_compress).
            parts.append(char)
    return ''.join(parts)
# Example: lossless round trip through LZ78
text = "ababababab"
compressed, dictionary = lz78_compress(text)
decompressed = lz78_decompress(compressed)
assert text == decompressed  # ✅ identical to the original
1.4 现代文本压缩格式
GZIP(基于DEFLATE)
组成: LZ77 + 霍夫曼编码
特点:
- 压缩比:通常2-10倍
- 速度:快速
- 应用:HTTP传输、文件压缩
使用示例:
import gzip

# Compress
text = "重复的文本内容 " * 1000
compressed = gzip.compress(text.encode('utf-8'))
# NOTE(review): len(text) counts characters, not bytes — the printed
# "字节" label understates the real UTF-8 size for Chinese text.
print(f"原始: {len(text)} 字节")
print(f"压缩: {len(compressed)} 字节")
print(f"压缩比: {len(compressed)/len(text):.2%}")
# Decompress
decompressed = gzip.decompress(compressed)
assert text.encode('utf-8') == decompressed  # ✅ identical to the original
BZIP2(基于Burrows-Wheeler变换)
特点:
- 压缩比:通常比GZIP高20-30%
- 速度:较慢
- 应用:备份、归档
7Z(LZMA算法)
特点:
- 压缩比:通常最高
- 速度:最慢
- 应用:长期归档
1.5 文本压缩的使用场景
场景1:文件传输
# HTTP response compression (requires the third-party Flask package)
import gzip
from flask import Flask, Response

app = Flask(__name__)

@app.route('/api/data')
def get_data():
    """Return a gzip-compressed payload with a matching Content-Encoding header."""
    data = "大量文本数据..."
    compressed = gzip.compress(data.encode('utf-8'))
    return Response(
        compressed,
        mimetype='application/gzip',
        headers={'Content-Encoding': 'gzip'}
    )
场景2:数据备份
import zipfile
import os
def backup_directory(source_dir, backup_file):
    """Back up *source_dir* into a DEFLATE-compressed ZIP and print a size report."""
    with zipfile.ZipFile(backup_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # Store paths relative to source_dir so the archive is relocatable.
                arcname = os.path.relpath(file_path, source_dir)
                zipf.write(file_path, arcname)
    # Compare total on-disk size with the archive size.
    original_size = sum(
        os.path.getsize(os.path.join(root, f))
        for root, _, files in os.walk(source_dir)
        for f in files
    )
    compressed_size = os.path.getsize(backup_file)
    print(f"原始大小: {original_size:,} 字节")
    print(f"压缩后: {compressed_size:,} 字节")
    # NOTE(review): division fails when source_dir is empty (original_size == 0).
    print(f"压缩比: {compressed_size/original_size:.2%}")
场景3:数据库压缩
# 存储压缩的文本字段
import sqlite3
import gzip
def store_compressed_text(db_path, text):
    """Gzip-compress *text* and insert it as a new row in the documents table.

    Assumes the target database already has a table
    ``documents(id, content)`` with a BLOB-compatible content column.
    """
    payload = gzip.compress(text.encode('utf-8'))
    connection = sqlite3.connect(db_path)
    connection.execute(
        "INSERT INTO documents (content) VALUES (?)",
        (payload,),
    )
    connection.commit()
    connection.close()
def retrieve_text(db_path, doc_id):
    """Fetch document *doc_id* and return its gzip-decompressed UTF-8 text."""
    connection = sqlite3.connect(db_path)
    cur = connection.cursor()
    cur.execute("SELECT content FROM documents WHERE id = ?", (doc_id,))
    payload = cur.fetchone()[0]
    connection.close()
    return gzip.decompress(payload).decode('utf-8')
场景4:日志压缩
import gzip
from datetime import datetime
def compress_log_file(log_file, compressed_file):
    """Gzip *log_file* into *compressed_file* and print a size report."""
    # Stream the source file into the gzip writer line by line.
    with open(log_file, 'rb') as src, gzip.open(compressed_file, 'wb') as dst:
        dst.writelines(src)
    before = os.path.getsize(log_file)
    after = os.path.getsize(compressed_file)
    print(f"日志压缩完成")
    print(f"原始: {before:,} 字节")
    print(f"压缩: {after:,} 字节")
    print(f"节省: {before - after:,} 字节")
第二部分:语料压缩(Information Compression)
2.1 什么是语料压缩?
语料压缩通过语义理解减少信息量,保留关键语义,丢弃细节。本质是信息提取和总结,而非存储优化。
核心特征:
- ✅ 语义保留:保留关键信息和语义
- ❌ 内容变化:丢失细节信息
- ❌ 无法精确还原:只能近似恢复
- ✅ 信息量减少:大幅减少信息量
2.2 语料压缩的基本原理
2.2.1 语义理解
大模型理解文本语义,区分重要信息和次要信息。
原始文本(1000字):
"2024年1月15日上午9点,在北京市海淀区中关村大街1号,
公司总部3楼会议室召开了年度总结会议。参会人员包括:
CEO张三(工号001)、CTO李四(工号002)..."
语料压缩后(200字):
"2024年1月15日,公司召开年度总结会议,讨论2023年业绩
和2024年规划,决定研发预算增加30%,扩招50人。"
信息量减少80%,但关键语义保留
2.2.2 压缩策略
- 摘要生成:生成文本摘要
- 关键点提取:提取结构化关键信息
- 语义嵌入:转换为低维向量表示
- 知识图谱:提取为三元组结构
2.3 语料压缩方法
方法1:摘要压缩(Summarization)
原理: 使用大模型生成文本摘要
实现示例:
import openai
class SummarizationCompressor:
    """Abstractive (lossy) text compressor backed by the OpenAI chat API."""

    def __init__(self, api_key, model="gpt-4"):
        self.api_key = api_key
        self.model = model
        # NOTE(review): sets the key globally on the openai module —
        # legacy (<1.0) SDK style; confirm the installed SDK version.
        openai.api_key = api_key

    def compress(self, text, compression_ratio=0.1):
        """
        Summarize *text* down to roughly the given ratio of its length.

        Args:
            text: original text.
            compression_ratio: target size ratio (0.1 = 10% of original).

        Returns:
            The model-generated summary string.
        """
        prompt = f"""请将以下文本压缩到原长度的{compression_ratio*100}%,
保留所有关键信息和语义:
原文:
{text}
要求:
1. 保留所有关键事实、数据和结论
2. 使用简洁的语言
3. 保持逻辑结构
4. 输出压缩后的文本:"""
        response = openai.ChatCompletion.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "你是专业的文本压缩专家,擅长在保持语义完整性的同时大幅压缩文本。"
                },
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,  # low temperature for stable, faithful output
            # assumes tokens ≈ characters × ratio — TODO confirm for CJK text
            max_tokens=int(len(text) * compression_ratio)
        )
        return response.choices[0].message.content

    def extractive_summarize(self, text, num_sentences=3):
        """Extractive summary: ask the model to pick the *num_sentences* most
        important sentences verbatim from *text*."""
        prompt = f"""从以下文本中提取最重要的{num_sentences}句话:
{text}
要求:
1. 选择最能代表原文核心内容的句子
2. 保持原句不变
3. 按重要性排序"""
        response = openai.ChatCompletion.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "你是文本摘要专家。"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2
        )
        return response.choices[0].message.content
# Usage example (requires a valid OpenAI API key)
compressor = SummarizationCompressor(api_key="your-key")
original = "很长的文本内容..."
compressed = compressor.compress(original, compression_ratio=0.1)
print(f"原始长度: {len(original)}")
print(f"压缩后: {len(compressed)}")
print(f"信息量减少: {(1 - len(compressed)/len(original))*100:.1f}%")
压缩效果:
- 信息量减少:70-90%
- 语义保留:关键信息保留
- 细节丢失:具体细节被丢弃
方法2:关键点提取(Key Points Extraction)
原理: 提取结构化关键信息
实现示例:
import json
class KeyPointsCompressor:
    """Compress text by extracting structured key points via an LLM (lossy)."""

    def __init__(self, api_key):
        self.api_key = api_key
        # Legacy (<1.0) OpenAI SDK style: key is set module-globally.
        openai.api_key = api_key

    def extract_keypoints(self, text):
        """Ask the model for structured key points and parse the JSON reply.

        Returns:
            The parsed dict, or ``{"raw": reply}`` when the reply is not
            valid JSON.
        """
        prompt = f"""请将以下文本提取为关键点,使用JSON格式输出:
原文:
{text}
输出格式:
{{
"主题": "文本主题",
"关键点": [
"要点1",
"要点2",
"要点3"
],
"重要数据": {{
"数据1": "值1",
"数据2": "值2"
}},
"结论": "主要结论",
"时间": "相关时间",
"地点": "相关地点",
"人物": ["人物1", "人物2"]
}}"""
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": "你是信息提取专家,擅长将文本提取为结构化关键点。"
                },
                {"role": "user", "content": prompt}
            ],
            temperature=0.2
        )
        result = response.choices[0].message.content
        try:
            # Strip a possible markdown code fence around the JSON.
            if result.startswith("```json"):
                result = result[7:]
            if result.startswith("```"):
                result = result[3:]
            if result.endswith("```"):
                result = result[:-3]
            result = result.strip()
            return json.loads(result)
        except json.JSONDecodeError:
            # Bug fix: was a bare `except:` that also hid unrelated errors
            # (KeyboardInterrupt, typos, etc.). Only JSON failures fall back.
            return {"raw": result}

    def keypoints_to_text(self, keypoints):
        """Generate coherent (approximate) text back from the key points."""
        prompt = f"""请根据以下关键点生成一段连贯的文本:
关键点:
{json.dumps(keypoints, ensure_ascii=False, indent=2)}
要求:
1. 生成自然流畅的文本
2. 包含所有关键信息
3. 保持逻辑关系"""
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "你是文本生成专家。"},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content
# Usage example (requires a valid OpenAI API key)
compressor = KeyPointsCompressor(api_key="your-key")
original = "详细的会议记录..."
keypoints = compressor.extract_keypoints(original)
# Store the key points (much smaller than the original text)
compressed_size = len(json.dumps(keypoints, ensure_ascii=False))
original_size = len(original)
print(f"原始: {original_size} 字符")
print(f"压缩: {compressed_size} 字符")
print(f"压缩比: {compressed_size/original_size:.2%}")
# Reconstruct text (approximate only)
recovered = compressor.keypoints_to_text(keypoints)
方法3:语义嵌入压缩(Semantic Embedding)
原理: 将文本转换为低维向量表示
实现示例:
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle
import gzip
class SemanticEmbeddingCompressor:
    """Lossy compressor: text → chunked sentence embeddings → uint8 quantization."""

    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
        self.model = SentenceTransformer(model_name)
        # NOTE(review): depends on the chosen model and is not used elsewhere
        # in this class — confirm before relying on it.
        self.embedding_dim = 384

    def compress(self, text, chunk_size=512):
        """
        Encode *text* chunk-by-chunk into quantized embedding vectors.

        Args:
            text: original text.
            chunk_size: characters per chunk.

        Returns:
            Dict with the quantized embeddings plus bookkeeping metadata.
        """
        chunks = [
            text[i:i+chunk_size]
            for i in range(0, len(text), chunk_size)
        ]
        embeddings = self.model.encode(chunks, convert_to_numpy=True)
        compressed = self.quantize(embeddings)
        return {
            'embeddings': compressed,
            'chunk_size': chunk_size,
            'original_length': len(text),
            'compressed_size': compressed['data'].nbytes
        }

    def quantize(self, embeddings, bits=8):
        """Uniformly quantize float embeddings to *bits* levels (stored as uint8)."""
        min_val = embeddings.min()
        max_val = embeddings.max()
        # Normalize to [0, 1]; epsilon guards against a constant array.
        normalized = (embeddings - min_val) / (max_val - min_val + 1e-8)
        quantized = (normalized * (2**bits - 1)).astype(np.uint8)
        return {
            'data': quantized,
            'min': min_val,
            'max': max_val,
            # Bug fix: record the quantization depth so decompress() does not
            # have to assume 8 bits.
            'bits': bits,
            'shape': embeddings.shape
        }

    def decompress(self, compressed_data):
        """Approximately reconstruct float embeddings from quantized data."""
        quantized = compressed_data['embeddings']
        # Default of 8 keeps payloads produced before 'bits' existed working.
        bits = quantized.get('bits', 8)
        normalized = quantized['data'].astype(np.float32) / (2**bits - 1)
        return (
            normalized * (quantized['max'] - quantized['min']) +
            quantized['min']
        )

    def save(self, compressed_data, filepath):
        """Persist compressed data as a gzipped pickle file."""
        with gzip.open(filepath, 'wb') as f:
            pickle.dump(compressed_data, f)

    def load(self, filepath):
        """Load compressed data previously written by save()."""
        with gzip.open(filepath, 'rb') as f:
            return pickle.load(f)
# Usage example (downloads the sentence-transformers model on first run)
compressor = SemanticEmbeddingCompressor()
text = "很长的文本内容..."
# Compress
compressed = compressor.compress(text)
print(f"原始: {compressed['original_length']} 字符")
print(f"压缩后: {compressed['compressed_size']} 字节")
print(f"压缩比: {compressed['compressed_size']/compressed['original_length']:.2%}")
# Save
compressor.save(compressed, 'compressed.pkl.gz')
# Load
loaded = compressor.load('compressed.pkl.gz')
embeddings = compressor.decompress(loaded)
压缩效果:
- 文本:1000字符 × 1字节 = 1000字节
- 嵌入:10块 × 384维 × 1字节(量化后)= 3840字节
- 但支持语义检索和相似度计算
方法4:知识图谱压缩(Knowledge Graph)
原理: 将文本提取为知识图谱(三元组)
实现示例:
class KnowledgeGraphCompressor:
    """Compress text into a knowledge graph (entities/relations/attributes) via an LLM."""

    def __init__(self, api_key):
        self.api_key = api_key
        # Legacy (<1.0) OpenAI SDK style: key is set module-globally.
        openai.api_key = api_key

    def text_to_kg(self, text):
        """Extract a knowledge graph (as JSON triples) from *text*.

        Returns:
            The parsed dict, or ``{"raw": reply}`` when the reply is not
            valid JSON.
        """
        prompt = f"""请将以下文本提取为知识图谱,使用JSON格式输出三元组:
原文:
{text}
输出格式:
{{
"entities": [
{{"id": 1, "name": "实体1", "type": "类型"}},
{{"id": 2, "name": "实体2", "type": "类型"}}
],
"relations": [
{{"subject": 1, "predicate": "关系", "object": 2}},
{{"subject": 2, "predicate": "关系", "object": 3}}
],
"attributes": [
{{"entity": 1, "attribute": "属性名", "value": "属性值"}}
]
}}"""
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "你是知识图谱提取专家。"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2
        )
        result = response.choices[0].message.content
        try:
            # Strip a possible markdown code fence around the JSON.
            if result.startswith("```json"):
                result = result[7:]
            if result.startswith("```"):
                result = result[3:]
            if result.endswith("```"):
                result = result[:-3]
            return json.loads(result.strip())
        except json.JSONDecodeError:
            # Bug fix: was a bare `except:` that also hid unrelated errors.
            return {"raw": result}

    def kg_to_text(self, kg):
        """Generate coherent (approximate) text back from the knowledge graph."""
        prompt = f"""请根据以下知识图谱生成一段连贯的文本:
知识图谱:
{json.dumps(kg, ensure_ascii=False, indent=2)}
要求:
1. 生成自然流畅的文本
2. 包含所有关键信息
3. 保持逻辑关系"""
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "你是文本生成专家。"},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content
# Usage example (requires a valid OpenAI API key)
compressor = KnowledgeGraphCompressor(api_key="your-key")
text = "张三在北京大学工作,担任计算机科学教授..."
kg = compressor.text_to_kg(text)
# Store the knowledge graph (much smaller than the original text)
kg_size = len(json.dumps(kg, ensure_ascii=False))
text_size = len(text)
print(f"原始: {text_size} 字符")
print(f"知识图谱: {kg_size} 字符")
print(f"压缩比: {kg_size/text_size:.2%}")
# Reconstruct text (approximate only)
recovered = compressor.kg_to_text(kg)
2.4 语料压缩的使用场景
场景1:文档摘要系统
class DocumentSummarizer:
    """Summarize text files on disk using SummarizationCompressor."""

    def __init__(self, api_key):
        self.compressor = SummarizationCompressor(api_key)

    def process_document(self, document_path):
        """Read a UTF-8 text file, summarize it, and write *_summary.txt beside it."""
        with open(document_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Summarize down to ~10% of the original length.
        summary = self.compressor.compress(content, compression_ratio=0.1)
        # NOTE(review): naive suffix replace — only meaningful for .txt paths;
        # any other extension silently overwrites nothing and reuses the path.
        summary_path = document_path.replace('.txt', '_summary.txt')
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write(summary)
        return summary
# Usage (requires a valid OpenAI API key and report.txt on disk)
summarizer = DocumentSummarizer(api_key="your-key")
summary = summarizer.process_document("report.txt")
场景2:知识库构建
class KnowledgeBaseBuilder:
    """Accumulate per-document knowledge graphs into an in-memory knowledge base."""

    def __init__(self, api_key):
        self.kg_compressor = KnowledgeGraphCompressor(api_key)
        self.kb = []  # list of {'id', 'kg', 'timestamp'} entries

    def add_document(self, text):
        """Extract a knowledge graph from *text* and append it to the KB."""
        kg = self.kg_compressor.text_to_kg(text)
        self.kb.append({
            'id': len(self.kb) + 1,
            'kg': kg,
            'timestamp': datetime.now().isoformat()
        })

    def search(self, query):
        """Search the knowledge base. Placeholder — not implemented."""
        # Intended: semantic-similarity search.
        # TODO: implement the search logic.
        pass

    def save(self, filepath):
        """Write the KB to *filepath* as pretty-printed UTF-8 JSON."""
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.kb, f, ensure_ascii=False, indent=2)
# Usage (requires a valid OpenAI API key)
kb = KnowledgeBaseBuilder(api_key="your-key")
kb.add_document("文档1内容...")
kb.add_document("文档2内容...")
kb.save("knowledge_base.json")
场景3:新闻摘要服务
class NewsSummarizer:
    """News summarization service combining an abstractive summary with key points."""

    def __init__(self, api_key):
        self.compressor = SummarizationCompressor(api_key)
        # Bug fix: also build the key-points compressor here. The original
        # summarize_news() referenced the bare name `api_key`, which is not
        # defined there and raised NameError at runtime.
        self.keypoints_compressor = KeyPointsCompressor(api_key)

    def summarize_news(self, news_article):
        """Return summary, key points, and length statistics for *news_article*."""
        keypoints = self.keypoints_compressor.extract_keypoints(news_article)
        summary = self.compressor.compress(news_article, compression_ratio=0.15)
        return {
            'summary': summary,
            'keypoints': keypoints,
            'original_length': len(news_article),
            'summary_length': len(summary)
        }
# Usage (requires a valid OpenAI API key)
summarizer = NewsSummarizer(api_key="your-key")
result = summarizer.summarize_news("长篇新闻文章...")
print(f"摘要: {result['summary']}")
print(f"压缩比: {result['summary_length']/result['original_length']:.2%}")
场景4:会议记录压缩
class MeetingNotesCompressor:
    """Compress meeting notes into a fixed structured record via key-point extraction."""

    def __init__(self, api_key):
        self.compressor = KeyPointsCompressor(api_key)

    def compress_meeting(self, meeting_notes):
        """Extract key points and map them into a meeting-record dict.

        NOTE(review): the Chinese keys ('时间', '地点', ...) must match the
        JSON schema requested by KeyPointsCompressor's prompt — verify the
        two stay in sync when either changes.
        """
        keypoints = self.compressor.extract_keypoints(meeting_notes)
        return {
            'date': keypoints.get('时间', ''),
            'location': keypoints.get('地点', ''),
            'participants': keypoints.get('人物', []),
            'key_points': keypoints.get('关键点', []),
            'decisions': keypoints.get('结论', ''),
            # NOTE(review): '重要数据' holds figures, not action items —
            # the field name may mislead consumers.
            'action_items': keypoints.get('重要数据', {})
        }
# Usage (requires a valid OpenAI API key)
compressor = MeetingNotesCompressor(api_key="your-key")
meeting_notes = "详细的会议记录..."
compressed = compressor.compress_meeting(meeting_notes)
print(json.dumps(compressed, ensure_ascii=False, indent=2))
第三部分:对比分析
3.1 核心区别总结
| 维度 | 文本压缩 | 语料压缩 |
|---|---|---|
| 目标 | 减少存储空间 | 减少信息量 |
| 方法 | 算法编码 | 语义理解 |
| 内容变化 | ❌ 不变 | ✅ 变化 |
| 可还原性 | ✅ 100%还原 | ❌ 无法精确还原 |
| 压缩比 | 2-10倍 | 10-100倍(信息量) |
| 速度 | 快 | 慢 |
| 成本 | 低 | 高(需要API) |
| 适用场景 | 文件传输、备份 | 文档摘要、知识提取 |
3.2 选择指南
使用文本压缩的场景:
- ✅ 需要完整保留文件内容
- ✅ 文件传输和存储优化
- ✅ 代码、配置文件压缩
- ✅ 数据库备份
- ✅ 日志归档
使用语料压缩的场景:
- ✅ 只需要关键信息
- ✅ 文档摘要生成
- ✅ 知识库构建
- ✅ 信息检索系统
- ✅ 内容归档(保留要点)
3.3 混合使用策略
class HybridCompressor:
    """Combine lossless gzip compression with lossy LLM summarization."""

    def __init__(self, api_key):
        self.text_compressor = gzip  # lossless, byte-level
        self.corpus_compressor = SummarizationCompressor(api_key)  # lossy, semantic

    def compress(self, text, mode='hybrid'):
        """
        Compress *text* and return bytes.

        Args:
            text: original text.
            mode: 'text' (lossless gzip) | 'corpus' (LLM summary) |
                'hybrid' (summary, then gzip).

        Raises:
            ValueError: on an unknown mode (the original silently
                returned None, which hid caller typos).
        """
        if mode == 'text':
            # Pure lossless compression.
            return self.text_compressor.compress(text.encode('utf-8'))
        if mode == 'corpus':
            # Pure semantic compression.
            summary = self.corpus_compressor.compress(text, compression_ratio=0.1)
            return summary.encode('utf-8')
        if mode == 'hybrid':
            # Semantic compression first, then lossless on the summary.
            summary = self.corpus_compressor.compress(text, compression_ratio=0.1)
            return self.text_compressor.compress(summary.encode('utf-8'))
        raise ValueError(f"unknown mode: {mode!r}")
# Usage (requires a valid OpenAI API key)
compressor = HybridCompressor(api_key="your-key")
text = "很长的文本..."
# Lossless text compression (content fully preserved)
text_compressed = compressor.compress(text, mode='text')
# Lossy corpus compression (key information only)
corpus_compressed = compressor.compress(text, mode='corpus')
# Hybrid compression (maximum size reduction)
hybrid_compressed = compressor.compress(text, mode='hybrid')
第四部分:实际应用案例
案例1:大型文档管理系统
需求: 存储大量文档,需要快速检索和节省空间
解决方案:
class DocumentManagementSystem:
    """Store full documents losslessly (gzip) plus a lossy key-point index.

    NOTE(review): `db` is an undefined placeholder for a storage backend —
    this class is illustrative pseudo-code until `db` is supplied.
    """

    def __init__(self, api_key):
        self.text_compressor = gzip  # lossless compression for full content
        self.corpus_compressor = KeyPointsCompressor(api_key)  # lossy index

    def store_document(self, doc_id, content):
        """Persist gzip-compressed content together with extracted key points."""
        # Full document: lossless gzip bytes.
        compressed_content = self.text_compressor.compress(
            content.encode('utf-8')
        )
        # Index: lossy structured key points for fast search.
        keypoints = self.corpus_compressor.extract_keypoints(content)
        db.store(doc_id, {
            'content': compressed_content,
            'keypoints': keypoints,
            'original_size': len(content),
            'compressed_size': len(compressed_content)
        })

    def search_documents(self, query):
        """Search over the compact key-point index (fast path)."""
        results = db.search_keypoints(query)
        return results

    def retrieve_document(self, doc_id):
        """Fetch and gzip-decompress the full document text."""
        data = db.get(doc_id)
        content = self.text_compressor.decompress(data['content'])
        return content.decode('utf-8')
案例2:新闻聚合平台
需求: 聚合大量新闻,生成摘要,节省存储
解决方案:
class NewsAggregationPlatform:
    """Process news articles into a summary plus knowledge-graph metadata."""

    def __init__(self, api_key):
        self.summarizer = SummarizationCompressor(api_key)
        self.kg_extractor = KnowledgeGraphCompressor(api_key)

    def process_news(self, article):
        """Summarize *article* and extract entity/key-point metadata.

        NOTE(review): kg.get('主题') / kg.get('关键点') do not match the
        entities/relations/attributes schema that text_to_kg requests, so
        these lookups likely always return the defaults — verify.
        """
        # Abstractive summary at ~15% of the original length.
        summary = self.summarizer.compress(article, compression_ratio=0.15)
        # Structured knowledge-graph extraction.
        kg = self.kg_extractor.text_to_kg(article)
        return {
            'title': kg.get('主题', ''),
            'summary': summary,
            'keypoints': kg.get('关键点', []),
            'entities': [e['name'] for e in kg.get('entities', [])],
            'timestamp': datetime.now().isoformat()
        }
总结
文本压缩
- 本质: 存储空间优化
- 方法: 算法编码(LZ77、霍夫曼等)
- 特点: 无损、可精确还原
- 适用: 文件传输、备份、存储优化
语料压缩
- 本质: 信息量减少
- 方法: 语义理解、总结提取
- 特点: 有损、保留关键语义
- 适用: 文档摘要、知识提取、信息检索
最佳实践
- 需要完整内容 → 使用文本压缩
- 只需关键信息 → 使用语料压缩
- 混合场景 → 结合使用两种方法