This repository contains historical code from the Deep Ignorance project that may be useful for unlearning analysis. Other artifacts from this project are available at https://github.com/EleutherAI/deep-ignorance and https://github.com/EleutherAI/filtering_for_danger.

Install the package in editable mode with dev extras, install the pre-commit hooks, and run the test suite:

```bash
pip install -e ".[dev]"
pre-commit install
pytest
```
Create and/or activate a venv:

```bash
python3 -m venv .venv && source .venv/bin/activate
```
- Run cmd-shift-P "install code/cursor command" if necessary.
- Install the Claude Code extension.
- Use /ide to connect to the IDE if disconnected.
Evaluate a model on the robust WMDP task and on MMLU:

```bash
python -m unlearn.evaluation.eval_wmdp_robust --model_path ./out/DeepIgnorance_CB --batch_size 8 --include_path unlearn/lm_eval_tasks
python -m unlearn.evaluation.eval_mmlu --model_path ./out/DeepIgnorance_CB --batch_size 8
```
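To sweep several checkpoints, the same two commands can be looped (a sketch; it assumes each subdirectory of ./out/ is a loadable checkpoint):

```bash
# Evaluate every checkpoint directory under ./out/ on both benchmarks
for m in ./out/*/; do
  python -m unlearn.evaluation.eval_wmdp_robust --model_path "$m" --batch_size 8 --include_path unlearn/lm_eval_tasks
  python -m unlearn.evaluation.eval_mmlu --model_path "$m" --batch_size 8
done
```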
Run the base CB unlearning script:

```bash
bash /home/luciarosequirke/lucia/unlearning/unlearn/scripts/base_unlearn_cb.sh
```

Tuned lens unlearning (a combined script sketch follows the list):

- Download data:
  ```bash
  python -m unlearn.create_unlearn_data
  ```

- Train lens:
  ```bash
  torchrun --nproc_per_node=8 unlearn/algorithm/tuned_lens/train.py --batch_size 4 --gradient_accumulation_steps 1 --upload_to_hf True --hf_repo_id 'EleutherAI/deep-ignorance-unfiltered-lens'
  ```

- Run tuned lens unlearning:

  ```bash
  python -m unlearn.algorithm.lens_unlearn --lens_path runs/tuned_lens/final
  ```
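The three steps combined into one fail-fast script (a sketch; commands and flags are exactly those above, and it assumes the lens trainer writes its final lens to runs/tuned_lens/final, as the unlearning command implies):

```bash
#!/bin/bash
set -e  # stop on the first failure

# 1. Build the unlearning dataset
python -m unlearn.create_unlearn_data

# 2. Train the tuned lens on 8 GPUs
torchrun --nproc_per_node=8 unlearn/algorithm/tuned_lens/train.py \
  --batch_size 4 --gradient_accumulation_steps 1 \
  --upload_to_hf True --hf_repo_id 'EleutherAI/deep-ignorance-unfiltered-lens'

# 3. Unlearn against the trained lens
python -m unlearn.algorithm.lens_unlearn --lens_path runs/tuned_lens/final
```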
Run the reference CAS fine-tuning attack against an unlearned checkpoint:

```bash
python -m unlearn.reference.cas.finetune_attack --epochs=1 --eval_every=5 --num_train_examples=64 --model_name <input_path> --save_name <output_path>
```
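For example, attacking the CB checkpoint evaluated earlier (the --save_name path here is a hypothetical placeholder):

```bash
python -m unlearn.reference.cas.finetune_attack --epochs=1 --eval_every=5 --num_train_examples=64 \
  --model_name ./out/DeepIgnorance_CB \
  --save_name ./out/DeepIgnorance_CB_attacked  # hypothetical output path
```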
sbatch file for the tamper attack:

```bash
#!/bin/bash
#SBATCH --job-name=tamper-attack
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --gpus-per-node=4
#SBATCH --time=4:00:00
#SBATCH --output=/home/a6a/lucia.a6a/unlearn/runs/tamper-%j.out
source /home/a6a/lucia.a6a/miniforge3/etc/profile.d/conda.sh
conda activate <env_name>
module load cuda/12.6
python -m unlearn.scripts.run_tamper_attack_with_plot \
--model_name=<model_path> \
--output_dir=runs/<tamper_output_dir> \
--num_train_examples=512 \
--epochs=1 \
--eval_every=10 \
--lr=2e-5
```
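Assuming the script above is saved as, say, tamper_attack.sbatch (the filename is an assumption), submit it with:

```bash
sbatch tamper_attack.sbatch
```

As the commands below suggest, the attack writes tamper_results_<timestamp>.json and .png under runs/<tamper_output_dir>/.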
Regenerate a plot with HP annotations from existing results:

```bash
python -m unlearn.scripts.run_tamper_attack_with_plot \
--plot_only=runs/<tamper_output_dir>/tamper_results_<timestamp>.json \
--title="Tamper Attack: <method>\n<hp_summary>"
```

Copy the plot to experiment_logs:

```bash
cp runs/<tamper_output_dir>/tamper_results_<timestamp>.png experiment_logs/tampering_<name>.png
```

Evaluation sbatch script (usage: `sbatch script.sbatch /path/to/model`):

```bash
#!/bin/bash
#SBATCH --job-name=mmlu-eval
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --gpus-per-node=4
#SBATCH --time=1:00:00
#SBATCH --output=/home/a6a/lucia.a6a/unlearn/runs/mmlu-eval-%j.out
source /home/a6a/lucia.a6a/miniforge3/etc/profile.d/conda.sh
conda activate <env_name>
module load cuda/12.6
...
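# NOTE: the elided lines above are assumed to set MODEL_PATH and REPO_ROOT,
# e.g. MODEL_PATH="$1" (the /path/to/model argument) and REPO_ROOT=$(pwd).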
torchrun --nproc_per_node=4 -m lm_eval --model hf \
--model_args pretrained=$MODEL_PATH \
--tasks wmdp_bio_robust \
--include_path "$REPO_ROOT/unlearn/lm_eval_tasks" \
--batch_size auto
torchrun --nproc_per_node=4 -m lm_eval --model hf \
--model_args pretrained=$MODEL_PATH \
--tasks mmlu \
--batch_size auto
```

Probe training details:
- WandB run: https://wandb.ai/eleutherai/depth-scaled-probes/runs/trga8lub
- 7 layers (8, 12, 16, 20, 24, 28, 32)
- Depth = (32 - layer), ranging from 24 transformer layers (layer 8) down to 0 (layer 32); see the sketch below
- Trained on WMDP-Bio-Remove forget data
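For reference, a quick way to print the layer-to-depth mapping implied by the formula above:

```bash
# depth = number of transformer layers above the probed layer (32 total)
for layer in 8 12 16 20 24 28 32; do
  echo "layer $layer -> depth $((32 - layer))"
done
```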
Usage for unlearning:

```bash
python -m unlearn.algorithm.probe_unlearn \
  --probe_dir ./models/depth_scaled_probes \
  --layers 8 12 16 20 24 28 \
  --lora --lora_r 16 \
  --num_train_examples 1024
```