# Source code for utils.extract_source_code

"""
extract_source_code.py

This module provides utilities for cleaning up project directories by removing
unwanted files and directories, and clearing Jupyter Notebook outputs.
"""

import argparse
import json
import os
import shutil
from pathlib import Path

# Whitelist of file extensions that are kept. Entries are lowercase and include
# the leading dot, so candidate extensions must be lowercased before lookup.
ALLOWED_EXTENSIONS = {
    # Notebooks, Python sources and shell scripts
    ".ipynb", ".py", ".sh",
    # Web assets
    ".html", ".css", ".js", ".json",
    # Images
    ".svg", ".png",
}



[docs]
def clear_jupyter_outputs(file_path: str) -> None:
    """Strip outputs and execution counts from a Jupyter Notebook, in place.

    The notebook is parsed as plain JSON. For every cell whose ``cell_type`` is
    ``"code"``, a non-empty ``outputs`` list is replaced by ``[]`` and a
    non-null ``execution_count`` is reset to ``None``. The file is rewritten
    only when at least one cell was actually modified.

    This is a best-effort operation: problems reading, parsing, or writing the
    file are reported on stdout and never raised to the caller.

    Args:
        file_path (str): The path to the Jupyter Notebook file.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            notebook = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error processing notebook {file_path}: {e}")
        return

    if not isinstance(notebook, dict):
        print(
            f"Error processing notebook {file_path}: "
            "top-level JSON value is not an object"
        )
        return

    cells = notebook.get("cells", [])
    if not isinstance(cells, list):
        print(f"Error processing notebook {file_path}: 'cells' is not a list")
        return

    changed = False
    for cell in cells:
        # Skip malformed entries instead of aborting the whole notebook.
        if not isinstance(cell, dict) or cell.get("cell_type") != "code":
            continue
        if cell.get("outputs"):
            cell["outputs"] = []
            changed = True
        if cell.get("execution_count") is not None:
            cell["execution_count"] = None
            changed = True

    if not changed:
        return

    # Serialize fully BEFORE opening the file for writing, so a serialization
    # problem can never leave a truncated notebook behind.
    # ensure_ascii=False keeps non-ASCII text readable instead of turning it
    # into \uXXXX escapes; the trailing newline keeps the file POSIX-friendly.
    serialized = json.dumps(notebook, indent=1, ensure_ascii=False) + "\n"
    try:
        serialized.encode("utf-8")
    except UnicodeEncodeError:
        # Lone surrogates cannot be encoded as UTF-8; fall back to escapes.
        serialized = json.dumps(notebook, indent=1) + "\n"

    try:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(serialized)
    except OSError as e:
        print(f"Error processing notebook {file_path}: {e}")
        return

    print(f"Cleared outputs: {file_path}")




[docs]
def contains_template_html(dir_path: str) -> bool:
    """Checks if a directory recursively contains any '*template*.html' file.

    The match is case-insensitive for both the file name and the extension,
    so ``Report_Template.HTML`` counts as a template just like
    ``template.html`` does.

    Args:
        dir_path (str): The path to the directory to check.

    Returns:
        bool: True if a matching template file is found, False otherwise
        (including when the directory is empty or does not exist).
    """
    # The cheap string checks run first so that is_file() (a filesystem stat)
    # is only performed for entries whose name already matches.
    return any(
        entry.suffix.lower() == ".html"
        and "template" in entry.name.lower()
        and entry.is_file()
        for entry in Path(dir_path).rglob("*")
    )




[docs]
def should_delete_dir(name: str, path: str) -> bool:
    """Decide whether a directory qualifies for removal.

    The directory name is compared case-insensitively:

    * ``output`` directories are always removed.
    * ``input`` directories are removed unless ``contains_template_html``
      reports a template file somewhere beneath them.
    * Any other directory is kept.

    Args:
        name (str): The name of the directory.
        path (str): The full path to the directory.

    Returns:
        bool: True if the directory should be deleted, False otherwise.
    """
    folded = name.lower()

    if folded == "input":
        # An input folder survives only when it holds a template.
        return not contains_template_html(path)

    return folded == "output"




[docs]
def clean_directory(target_dir: str) -> None:
    """Traverse a directory tree, pruning folders and files and clearing notebooks.

    For each directory visited (top-down):

    1. Sub-directories for which ``should_delete_dir`` returns True are removed
       with ``shutil.rmtree`` (errors ignored) and excluded from traversal.
    2. Files whose lowercased extension is not in ``ALLOWED_EXTENSIONS`` are
       deleted; a failed deletion is reported and skipped.
    3. Remaining ``.ipynb`` files are passed to ``clear_jupyter_outputs``.

    Args:
        target_dir (str): The root directory to start cleaning from.
    """
    for current, subdirs, filenames in os.walk(target_dir, topdown=True):
        # --- Folders -------------------------------------------------------
        # Work on a snapshot: 'subdirs' itself is edited below, and os.walk
        # (top-down) only descends into the entries that remain in it.
        for sub in tuple(subdirs):
            sub_path = os.path.join(current, sub)
            if not should_delete_dir(sub, sub_path):
                continue

            print(f"Deleting folder: {sub_path}")
            shutil.rmtree(sub_path, ignore_errors=True)
            subdirs.remove(sub)  # keep os.walk out of the removed folder

        # --- Files ---------------------------------------------------------
        for filename in filenames:
            full_path = os.path.join(current, filename)
            suffix = os.path.splitext(filename)[1].lower()

            if suffix in ALLOWED_EXTENSIONS:
                # Kept file; notebooks additionally get their outputs cleared.
                if suffix == ".ipynb":
                    clear_jupyter_outputs(full_path)
                continue

            # Anything without a whitelisted extension is removed.
            try:
                os.remove(full_path)
                print(f"Deleted file: {full_path}")
            except Exception as err:
                print(f"Could not delete {full_path}: {err}")



if __name__ == "__main__":
    # Command-line entry point: one optional positional argument naming the
    # directory to clean; the current directory is used when it is omitted.
    cli = argparse.ArgumentParser(
        description="Clean directories, prune unneeded files, and clear Jupyter outputs."
    )
    cli.add_argument(
        "path",
        nargs="?",
        default=".",
        help="Target directory to clean (defaults to current directory)",
    )

    # Resolve to an absolute path so the banner shows exactly what is cleaned.
    target_directory = os.path.abspath(cli.parse_args().path)
    print(f"Starting cleanup in: {target_directory}\n{'-' * 40}")

    # The cleanup is destructive, so require an explicit "y" before proceeding.
    answer = input(
        "WARNING: This script will permanently delete files and directories. Continue? (y/n): "
    )
    if answer.lower() != "y":
        print("Aborted.")
    else:
        clean_directory(target_directory)
        print(f"{'-' * 40}\nCleanup complete.")