Fresh start - excluded large ROM JSON files
This commit is contained in:
249
tools/memory_embedding_worker.py
Normal file
249
tools/memory_embedding_worker.py
Normal file
@@ -0,0 +1,249 @@
|
||||
"""
|
||||
Memory Embedding Worker
|
||||
=======================
|
||||
Process memory files and store with embeddings in SQLite.
|
||||
Runs as cron job or standalone.
|
||||
|
||||
Usage:
|
||||
python memory_embedding_worker.py [--date YYYY-MM-DD]
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
# Add parent dir to path for memory_vector
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import json
|
||||
import re
|
||||
import requests
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Tuple, Optional
|
||||
from memory_vector import setup_memory_vectors, store_memory, MemoryVectorDB
|
||||
|
||||
# Ollama configuration
|
||||
OLLAMA_URL = "http://localhost:11434"
|
||||
EMBED_MODEL = "nomic-embed-text"
|
||||
CHUNK_SIZE = 512 # Characters per chunk
|
||||
|
||||
|
||||
def generate_embedding(text: str) -> Optional[List[float]]:
    """Return the embedding vector for *text* from the local Ollama server.

    The prompt is truncated to 8192 characters before being sent. On any
    failure (network error, HTTP error, malformed response) an error is
    printed and None is returned — callers treat None as "skip this chunk".
    """
    try:
        resp = requests.post(
            f"{OLLAMA_URL}/api/embeddings",
            json={
                "model": EMBED_MODEL,
                "prompt": text[:8192],  # Truncate if too long
            },
            timeout=60,
        )
        resp.raise_for_status()
        return resp.json()["embedding"]
    except Exception as exc:
        # Best-effort: report and signal failure rather than crash the sync.
        print(f"[ERROR] Failed to generate embedding: {exc}")
        return None
def chunk_text(text: str, max_chars: int = CHUNK_SIZE) -> List[str]:
    """Split *text* into chunks of at most *max_chars* characters.

    Paragraphs (separated by blank lines) are packed greedily into chunks.
    Bug fix: a single paragraph longer than *max_chars* used to be emitted
    as one oversized chunk; it is now hard-split so no returned chunk ever
    exceeds the limit.

    Args:
        text: The text to split.
        max_chars: Maximum characters per chunk.

    Returns:
        At least one chunk; for empty/whitespace-only text the original
        fallback ``[text[:max_chars]]`` is preserved.
    """
    chunks: List[str] = []
    current_chunk = ""

    for para in text.split('\n\n'):
        # Hard-split any paragraph that alone exceeds the limit.
        while len(para) > max_chars:
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            chunks.append(para[:max_chars].strip())
            para = para[max_chars:]

        if len(current_chunk) + len(para) + 2 <= max_chars:
            current_chunk += f"\n\n{para}" if current_chunk else para
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = para

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks if chunks else [text[:max_chars]]
def extract_sections(content: str) -> List[Tuple[str, str]]:
    """Extract (title, body) sections from markdown *content*.

    Headers are ``##``/``###`` lines. Fixes over the original version:
    the header pattern is anchored to line starts (``re.MULTILINE``) so a
    mid-line ``##`` is no longer mistaken for a header, and text that
    precedes the first header is kept as a "General" section instead of
    being silently dropped.

    Args:
        content: Markdown text.

    Returns:
        List of (title, body) tuples. Bodies of 50 characters or fewer are
        skipped as noise. If nothing qualifies, the whole stripped content
        becomes a single ("General", ...) section.
    """
    sections: List[Tuple[str, str]] = []

    # Keep any preamble text that appears before the first header.
    first = re.search(r'^#{2,3}\s+', content, re.MULTILINE)
    if first:
        preamble = content[:first.start()].strip()
        if len(preamble) > 50:
            sections.append(("General", preamble))

    # Look for ## / ### headers at line starts.
    pattern = r'^#{2,3}\s+(.+?)\n(.*?)(?=^#{2,3}\s+|\Z)'
    for title, body in re.findall(pattern, content, re.DOTALL | re.MULTILINE):
        title = title.strip()
        body = body.strip()
        if body and len(body) > 50:  # Skip empty or tiny sections
            sections.append((title, body))

    # If no sections found, treat whole content as one section
    if not sections and content.strip():
        sections.append(("General", content.strip()))

    return sections
def process_memory_file(filepath: str, source_type: str) -> int:
    """Embed and store the contents of a single memory file.

    Args:
        filepath: Path to the memory file.
        source_type: 'daily', 'memory_md', 'project', etc.

    Returns:
        Number of embedding entries created (0 if the file is missing).
    """
    if not os.path.exists(filepath):
        print(f"[SKIP] File not found: {filepath}")
        return 0

    with open(filepath, 'r', encoding='utf-8', errors='replace') as fh:
        content = fh.read()

    # The bare filename serves as the stored source reference.
    source_path = os.path.basename(filepath)
    sections = extract_sections(content)
    created = 0

    print(f"[PROCESS] {source_path}: {len(sections)} sections found")

    for title, body in sections:
        chunks = chunk_text(body)
        total = len(chunks)

        for idx, chunk in enumerate(chunks, start=1):
            embedding = generate_embedding(chunk)
            if not embedding:
                print(f" [FAILED] Chunk {idx}/{total}")
                continue

            # Label multi-chunk sections so entries stay distinguishable.
            label = title if total == 1 else f"{title} (chunk {idx}/{total})"
            store_memory(
                source_type=source_type,
                source_path=f"{source_path}#{label}",
                content=chunk[:500],  # Store preview
                embedding=embedding,
            )
            created += 1
            print(f" [STORED] Chunk {idx}/{total}")

    return created
def get_memory_files(date: Optional[str] = None) -> List[Tuple[str, str]]:
    """Collect memory files that should be (re)embedded.

    Args:
        date: Specific date (YYYY-MM-DD) to process, or None for the
            default behaviour (yesterday's and today's daily notes).

    Returns:
        List of (filepath, source_type) tuples for files that exist.
    """
    workspace = os.path.expanduser("~/.openclaw/workspace")
    memory_dir = os.path.join(workspace, "memory")
    files: List[Tuple[str, str]] = []

    if date:
        # Explicit date requested: only that day's note.
        daily_path = os.path.join(memory_dir, f"{date}.md")
        if os.path.exists(daily_path):
            files.append((daily_path, "daily"))
    else:
        # Yesterday's daily note ...
        yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
        daily_path = os.path.join(memory_dir, f"{yesterday}.md")
        if os.path.exists(daily_path):
            files.append((daily_path, "daily"))

        # ... plus today's note, in case the worker runs during the day.
        today_path = os.path.join(memory_dir, f"{datetime.now():%Y-%m-%d}.md")
        if today_path != daily_path and os.path.exists(today_path):
            files.append((today_path, "daily"))

    # MEMORY.md is included only when it was touched in the last 24 hours.
    memory_md = os.path.join(workspace, "MEMORY.md")
    if os.path.exists(memory_md):
        mtime = datetime.fromtimestamp(os.path.getmtime(memory_md))
        if datetime.now() - mtime < timedelta(hours=24):
            files.append((memory_md, "memory_md"))

    return files
def run_daily_sync(date: Optional[str] = None, dry_run: bool = False) -> dict:
    """Run the daily memory embedding sync.

    Args:
        date: Specific date to process (YYYY-MM-DD), or None for yesterday.
        dry_run: If True, only report which files would be processed —
            nothing is embedded or stored. (Bug fix: previously dry_run
            skipped database setup but still embedded and stored entries.)

    Returns:
        Dict with stats: ``{"files": N, "entries": M}`` (plus a ``"failed"``
        key when files were found, kept for backward compatibility).
    """
    print("=" * 50)
    print("Memory Embedding Worker")
    print("=" * 50)

    # Setup database (skipped entirely in dry-run mode).
    if not dry_run:
        setup_memory_vectors()
        print("[OK] Database ready\n")
    else:
        print("[DRY RUN] No database changes\n")

    # Get files to process
    files = get_memory_files(date)

    if not files:
        print("[INFO] No memory files to process")
        return {"files": 0, "entries": 0}

    print(f"[INFO] Processing {len(files)} file(s):\n")
    for f, t in files:
        print(f" - {f} ({t})")
    print()

    stats = {"files": len(files), "entries": 0, "failed": 0}

    if dry_run:
        # BUG FIX: the original called process_memory_file() even in
        # dry-run mode, which generated AND stored embeddings. Stop here
        # so a dry run really has no side effects.
        print("[DRY RUN] Skipping embedding generation and storage")
        return stats

    # Process each file
    total_entries = 0
    for filepath, source_type in files:
        print(f"\n[FILE] {os.path.basename(filepath)}")
        entries = process_memory_file(filepath, source_type)
        total_entries += entries
        print(f" Created {entries} embedding entries")

    stats["entries"] = total_entries

    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Files processed: {stats['files']}")
    print(f"Embedding entries: {stats['entries']}")

    return stats
def main():
    """CLI entry point.

    Exit status is 0 when at least one embedding entry was created,
    1 otherwise (handy for cron monitoring).
    """
    import argparse

    parser = argparse.ArgumentParser(description='Memory Embedding Worker')
    parser.add_argument('--date', help='Process specific date (YYYY-MM-DD)')
    parser.add_argument('--dry-run', action='store_true', help='Test without storing')
    opts = parser.parse_args()

    result = run_daily_sync(date=opts.date, dry_run=opts.dry_run)

    # Exit code: 0 if success, 1 if no entries were created.
    return 0 if result["entries"] > 0 else 1


if __name__ == "__main__":
    sys.exit(main())
Reference in New Issue
Block a user