Fresh start - excluded large ROM JSON files
This commit is contained in:
111
tools/rom-full-scan.py
Normal file
111
tools/rom-full-scan.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import os
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Root of the ROM collection to scan (Windows drive mapping).
ROM_ROOT = 'R:\\'
# Where the JSON scan report is written; created on demand by scan_roms().
OUTPUT_DIR = Path('C:/Users/admin/.openclaw/workspace/rom-inventory')
# Files larger than this (100 MiB) are not hashed, only counted/sized.
HASH_LIMIT = 100 * 1024 * 1024
|
||||
|
||||
def get_file_hash(filepath, limit_bytes=None):
    """Return the MD5 hex digest of a file, or None if it cannot be read.

    Args:
        filepath: Path of the file to hash.
        limit_bytes: If truthy, hash only the first ``limit_bytes`` bytes
            (bounds the work done on large files); otherwise hash the
            whole file, streamed in 8 KiB chunks.

    Returns:
        The hex digest string, or None when the file is unreadable
        (missing, permission denied, I/O error).
    """
    # MD5 is used as a duplicate-detection fingerprint, not for security.
    hasher = hashlib.md5()
    try:
        with open(filepath, 'rb') as f:
            if limit_bytes:
                # Single bounded read; callers keep limits well under RAM.
                hasher.update(f.read(limit_bytes))
            else:
                # Stream in chunks so huge files are never fully in memory.
                while chunk := f.read(8192):
                    hasher.update(chunk)
        return hasher.hexdigest()
    except OSError:
        # Only I/O failures are expected here; signal them with None so the
        # caller can count the error. (Was `except Exception`, which would
        # also have hidden programming errors.)
        return None
|
||||
|
||||
def scan_roms():
    """Walk the organized ROM tree and write a JSON inventory report.

    Expects the layout ``<ROM_ROOT>/Rom Sets (Organized)/<Manufacturer>/
    <System>/...``. For every file it records path/name/size plus
    per-manufacturer and per-system aggregates; files at or below
    HASH_LIMIT are MD5-hashed so duplicates can be detected. Results
    (stats, duplicate groups, and the first 10,000 file entries) are
    saved to ``OUTPUT_DIR/rom-full-scan.json``.

    Returns:
        The stats dict (totals, per-system and per-manufacturer counts).
    """
    print(f'Starting full ROM scan at {datetime.now()}')
    print(f'Root: {ROM_ROOT}')

    all_files = []
    hash_map = {}  # md5 hex digest -> list of paths sharing that digest
    stats = {
        'total_files': 0,
        'total_size': 0,
        'hashed_files': 0,
        'skipped_large': 0,
        'errors': 0,
        'by_system': {},
        'by_manufacturer': {},
    }

    organized_path = Path(ROM_ROOT) / 'Rom Sets (Organized)'
    if organized_path.exists():
        for manufacturer_dir in organized_path.iterdir():
            if not manufacturer_dir.is_dir():
                continue
            manufacturer = manufacturer_dir.name
            stats['by_manufacturer'][manufacturer] = {'files': 0, 'size': 0}

            for system_dir in manufacturer_dir.iterdir():
                if not system_dir.is_dir():
                    continue
                system = system_dir.name
                # The same system name may appear under several
                # manufacturers, so only initialize it once.
                stats['by_system'].setdefault(system, {'files': 0, 'size': 0})

                for root, _dirs, files in os.walk(system_dir):
                    for fname in files:
                        filepath = Path(root) / fname
                        try:
                            size = filepath.stat().st_size
                        except OSError:
                            # File vanished or is unreadable: count it and
                            # move on. (Narrowed from `except Exception`,
                            # which also hid bookkeeping bugs.)
                            stats['errors'] += 1
                            continue

                        entry = {
                            'path': str(filepath),
                            'name': fname,
                            'size': size,
                            'system': system,
                            'manufacturer': manufacturer,
                        }
                        all_files.append(entry)
                        stats['total_files'] += 1
                        stats['total_size'] += size
                        stats['by_manufacturer'][manufacturer]['files'] += 1
                        stats['by_manufacturer'][manufacturer]['size'] += size
                        stats['by_system'][system]['files'] += 1
                        stats['by_system'][system]['size'] += size

                        if size <= HASH_LIMIT:
                            file_hash = get_file_hash(filepath)
                            # get_file_hash returns None on read failure.
                            if file_hash:
                                entry['hash'] = file_hash
                                stats['hashed_files'] += 1
                                hash_map.setdefault(file_hash, []).append(str(filepath))
                        else:
                            # Too big to hash within the configured budget.
                            stats['skipped_large'] += 1

    duplicates = {h: paths for h, paths in hash_map.items() if len(paths) > 1}

    print(f'\nScan complete at {datetime.now()}')
    print(f'Total files: {stats["total_files"]:,}')
    print(f'Total size: {stats["total_size"] / (1024**3):.2f} GB')
    print(f'Hashed: {stats["hashed_files"]:,}')
    print(f'Skipped (large): {stats["skipped_large"]:,}')
    print(f'Duplicates found: {len(duplicates)}')

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    output = {
        'scan_date': datetime.now().isoformat(),
        'stats': stats,
        'duplicates': duplicates,
        # Cap the per-file listing to keep the report a manageable size;
        # the aggregate stats above still cover every file scanned.
        'files': all_files[:10000],
    }

    with open(OUTPUT_DIR / 'rom-full-scan.json', 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)

    print(f'\nResults saved to {OUTPUT_DIR / "rom-full-scan.json"}')
    return stats
|
||||
|
||||
# Script entry point: run the full scan when executed directly.
if __name__ == '__main__':
    scan_roms()
|
||||
Reference in New Issue
Block a user