Source code for utils.extract_source_code
"""
extract_source_code.py
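This module provides utilities for cleaning up project directories: it deletes
files whose extension is not in ALLOWED_EXTENSIONS, removes "output" folders
and "input" folders that hold no '*template*.html' file, and clears the
outputs and execution counts of Jupyter Notebook code cells.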
"""
import argparse
import json
import os
import shutil
from pathlib import Path
# Suffixes of the files that survive a cleanup run. Entries are lowercase and
# include the leading dot, because candidate extensions are lowercased before
# the membership test.
ALLOWED_EXTENSIONS = {
    # Notebooks and scripts
    ".ipynb",
    ".py",
    ".sh",
    # Web sources
    ".html",
    ".css",
    ".js",
    # Data
    ".json",
    # Images
    ".svg",
    ".png",
}
[docs]
def clear_jupyter_outputs(file_path: str) -> None:
    """Strip outputs and execution counts from a Jupyter Notebook in place.

    The notebook is parsed as plain JSON. Every cell whose ``cell_type`` is
    ``"code"`` gets its ``outputs`` emptied and its ``execution_count`` reset
    to ``None``. The file is rewritten only when at least one cell changed.

    Failures (unreadable file, invalid JSON, unexpected structure) are
    reported on stdout instead of being raised, so one bad notebook does not
    abort a larger cleanup run.

    Args:
        file_path (str): The path to the Jupyter Notebook file.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            notebook = json.load(f)

        changed = False
        for cell in notebook.get("cells", []):
            if cell.get("cell_type") != "code":
                continue
            if cell.get("outputs"):
                cell["outputs"] = []
                changed = True
            if cell.get("execution_count") is not None:
                cell["execution_count"] = None
                changed = True

        if not changed:
            return

        # Serialize BEFORE reopening the file: mode "w" truncates it, so
        # nothing that can fail should happen between truncation and write.
        # ensure_ascii=False keeps non-ASCII text as-is instead of rewriting
        # it as \uXXXX escapes.
        serialized = json.dumps(notebook, indent=1, ensure_ascii=False)
        try:
            serialized.encode("utf-8")
        except UnicodeEncodeError:
            # Lone surrogates cannot be written as UTF-8; fall back to the
            # fully escaped form, which is always encodable.
            serialized = json.dumps(notebook, indent=1)

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(serialized)
            f.write("\n")  # end the file with a newline
        print(f"Cleared outputs: {file_path}")
    except (OSError, ValueError, AttributeError, TypeError) as e:
        # OSError: file access; ValueError: bad JSON or bad UTF-8;
        # AttributeError/TypeError: JSON that is not shaped like a notebook.
        print(f"Error processing notebook {file_path}: {e}")
[docs]
def contains_template_html(dir_path: str) -> bool:
    """Check whether a directory tree holds any '*template*.html' file.

    The search is recursive. Both the file name and the ``.html`` suffix are
    compared case-insensitively, so ``Report_TEMPLATE.HTML`` counts as a
    match. Only regular files match; a directory named like a template does
    not.

    Args:
        dir_path (str): The path to the directory to check.

    Returns:
        bool: True if a matching template file is found, False otherwise.
    """
    # The cheap string tests run first so is_file() only touches the
    # filesystem for entries that already look like a template.
    return any(
        candidate.suffix.lower() == ".html"
        and "template" in candidate.name.lower()
        and candidate.is_file()
        for candidate in Path(dir_path).rglob("*")
    )
[docs]
def should_delete_dir(name: str, path: str) -> bool:
    """Decide whether a directory qualifies for removal.

    The name is compared case-insensitively. An "output" directory always
    qualifies; an "input" directory qualifies only when
    ``contains_template_html`` finds no template in it; any other name never
    qualifies.

    Args:
        name (str): The name of the directory.
        path (str): The full path to the directory.

    Returns:
        bool: True if the directory should be deleted, False otherwise.
    """
    folded = name.lower()
    if folded == "input":
        # An input folder is kept only while it still holds a template.
        return not contains_template_html(path)
    return folded == "output"
[docs]
def clean_directory(target_dir: str) -> None:
    """Walk a directory tree, pruning folders and files and clearing notebooks.

    For every directory visited (top-down):

    1. Sub-directories flagged by ``should_delete_dir`` are removed and are
       never descended into.
    2. Files whose lowercased extension is not in ``ALLOWED_EXTENSIONS`` are
       deleted.
    3. The remaining ``.ipynb`` files are passed to ``clear_jupyter_outputs``.

    Problems are reported on stdout and the walk continues.

    Args:
        target_dir (str): The root directory to start cleaning from.
    """
    for root, dirs, files in os.walk(target_dir, topdown=True):
        # 1. Handle directories
        kept_dirs = []
        for d in dirs:
            dir_path = os.path.join(root, d)
            if not should_delete_dir(d, dir_path):
                kept_dirs.append(d)
                continue

            print(f"Deleting folder: {dir_path}")
            shutil.rmtree(dir_path, ignore_errors=True)
            # ignore_errors=True hides every failure, so check the result
            # explicitly instead of assuming the folder is gone.
            if os.path.lexists(dir_path):
                print(f"Could not fully delete folder: {dir_path}")

        # Slice assignment mutates the list os.walk itself holds; with
        # topdown=True it only descends into the names left in it.
        dirs[:] = kept_dirs

        # 2. Handle files
        for f in files:
            file_path = os.path.join(root, f)
            ext_lower = os.path.splitext(f)[1].lower()

            if ext_lower not in ALLOWED_EXTENSIONS:
                try:
                    os.remove(file_path)
                    print(f"Deleted file: {file_path}")
                except OSError as e:
                    print(f"Could not delete {file_path}: {e}")
                continue  # Move to the next file

            if ext_lower == ".ipynb":
                clear_jupyter_outputs(file_path)
if __name__ == "__main__":
    # Command-line entry point: clean the given directory (default: the
    # current one) after an explicit confirmation.
    parser = argparse.ArgumentParser(
        description="Clean directories, prune unneeded files, and clear Jupyter outputs."
    )
    parser.add_argument(
        "path",
        nargs="?",
        default=".",
        help="Target directory to clean (defaults to current directory)",
    )
    args = parser.parse_args()
    target_directory = os.path.abspath(args.path)

    # os.walk silently yields nothing for a missing path, which would end in
    # a misleading "Cleanup complete." - reject such input up front.
    if not os.path.isdir(target_directory):
        parser.error(f"Not a directory: {target_directory}")

    print(f"Starting cleanup in: {target_directory}\n{'-' * 40}")

    # Deletion is permanent, so require an explicit confirmation first.
    try:
        confirm = input(
            "WARNING: This script will permanently delete files and directories. Continue? (y/n): "
        )
    except EOFError:
        # No interactive stdin (closed or exhausted pipe): treat as "no".
        print()
        confirm = ""

    # Tolerate surrounding whitespace and a spelled-out "yes".
    if confirm.strip().lower() in {"y", "yes"}:
        clean_directory(target_directory)
        print(f"{'-' * 40}\nCleanup complete.")
    else:
        print("Aborted.")