import os
import hashlib
import sys
import orjson
from PIL import Image
import numpy as np
from pathlib import Path


def calculate_file_hash(file, algorithm='sha256', chunk_size=8192):
    """Return a hex digest identifying the content of *file*.

    Files whose suffix is ``.jpg`` or ``.webp`` are decoded and hashed by
    their pixel data rather than their on-disk bytes, so two files that
    decode to the same pixels get the same digest regardless of how they
    are encoded. Every other file is hashed from its raw bytes, read in
    chunks.

    Args:
        file: ``pathlib.Path`` of the file to hash.
        algorithm: Any algorithm name accepted by ``hashlib.new``. Used for
            both the image and the raw-bytes branch.
        chunk_size: Number of bytes read per iteration for non-image files.

    Returns:
        The hex digest as a string, or ``None`` if the file could not be
        read (an error message is printed in that case).

    Raises:
        ValueError: If *algorithm* is not known to ``hashlib``.
    """
    hasher = hashlib.new(algorithm)

    try:
        if file.suffix in (".jpg", ".webp"):
            # Close the image handle deterministically instead of leaking it
            # until garbage collection.
            with Image.open(file) as img:
                pixels = np.asarray(img).astype(np.uint8)
            # Mix the shape into the digest: without it, images of different
            # dimensions whose flattened bytes happen to match would collide.
            hasher.update(repr(pixels.shape).encode("ascii"))
            hasher.update(pixels.tobytes())
        else:
            with file.open('rb') as f:
                while chunk := f.read(chunk_size):
                    hasher.update(chunk)
    except OSError as e:
        # IOError is an alias of OSError; Pillow's "cannot identify image"
        # error is an OSError subclass too, so unreadable images are
        # reported the same way as unreadable plain files.
        print(f"Error reading file {file}: {e}")
        return None

    return hasher.hexdigest()

def main():
    """Deduplicate the ``files`` directory and merge its JSON files.

    Works on the ``files`` directory located next to this script:

    1. Every regular file directly inside it is hashed with
       ``calculate_file_hash``. The first file seen for a given hash is
       kept; any later file with the same hash is deleted from disk.
       Files that could not be hashed are left untouched.
    2. Every remaining ``*.json`` file is parsed and stored under its
       ``"fulltitle"`` value. If two files share a ``"fulltitle"``, the
       later one replaces the earlier one.
    3. The resulting mapping is written to ``combined.json`` in the
       current working directory.

    Raises:
        KeyError: If a JSON file has no ``"fulltitle"`` key.
        orjson.JSONDecodeError: If a JSON file cannot be parsed.
    """
    root = Path(__file__).parent / "files"
    hashes_to_paths = {}

    # glob() order depends on the filesystem. Sorting makes the choice of
    # which duplicate survives reproducible, and takes a snapshot of the
    # listing before any file is deleted.
    for file in sorted(root.glob("*")):
        if not file.is_file():
            continue

        file_hash = calculate_file_hash(file)
        if file_hash is None:
            # Unreadable file: already reported by calculate_file_hash.
            continue

        if file_hash in hashes_to_paths:
            print("Removing", file, "for collision with", hashes_to_paths[file_hash])
            file.unlink()
        else:
            hashes_to_paths[file_hash] = file

    # Re-list after deduplication so removed files are not read; sorted so
    # that the winner among equal "fulltitle" values is reproducible.
    data = {}
    for file in sorted(root.glob("*.json")):
        info = orjson.loads(file.read_bytes())
        data[info["fulltitle"]] = info

    with open("combined.json", 'wb') as f:
        f.write(orjson.dumps(data))

# Run the deduplication and JSON merge only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
