Fine-Tuning DeepSeek Coder 6.7B using QLoRA

llm deepseek fine-tuning qlora peft machine learning neural networks on March 10, 2025

In this post, I'll walk through fine-tuning the DeepSeek Coder 6.7B model using QLoRA.

Fine-tuning with QLoRA lets us train a small adapter network on top of the DeepSeek model instead of modifying the underlying model weights. This is crucial because:

Full model training would require enormous computational resources
We don't have access to the original training data, making it difficult to prevent catastrophic forgetting
We only need to store a tiny fraction of the parameters (the adapter) rather than a complete model copy

Key aspects of this approach:

Supervised Fine-Tuning (SFT) - Training the model on specific question-answer pairs
QLoRA (Quantized Low-Rank Adaptation) - Uses 4-bit quantization to reduce memory requirements
PEFT (Parameter-Efficient Fine-Tuning) - Only trains a small subset of model parameters
Consistent formatting - Using the same format for training and inference
Repetition - Each Q&A pair is repeated multiple times to reinforce learning

I ran this on an EC2 instance for better Nvidia/CUDA support + a bit faster performance than on my older MBP. See the EC2 Nodes section for more details.

This was built in tandem with Cursor + ChatGPT + Claude.

#!/usr/bin/env python3
"""
A solution for fine-tuning DeepSeek with exact response control.
- Uses repetition to embed precise answers
- Exact formatting between training and inference
- Strict constraints on generation
"""
import os
import argparse
import warnings
import gc
from pathlib import Path

# Required imports
import torch
import peft
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTTrainer

# Configuration
# Identifier passed to `from_pretrained` for the base model and its tokenizer.
MODEL_NAME = "deepseek-ai/deepseek-coder-6.7b-base"
# Directory the trained adapter and tokenizer are saved to and later loaded from.
OUTPUT_DIR = "deepseek-coder-finetuned"
# Assigned to `tokenizer.model_max_length` before the trainer is created.
MAX_SEQ_LENGTH = 1024

def parse_args():
    """Read the script's command-line options.

    Returns:
        The parsed namespace; its ``retrain`` attribute is True only when
        ``--retrain`` was given on the command line.
    """
    cli = argparse.ArgumentParser(description="Fine-tune DeepSeek with QLoRA")
    cli.add_argument(
        "--retrain",
        action="store_true",
        help="Force retraining even if model exists",
    )
    namespace = cli.parse_args()
    return namespace

def create_sample_dataset():
    """Build the small training set of repeated, tag-formatted Q&A examples.

    Returns:
        A ``(dataset, knowledge_base)`` tuple: the ``Dataset`` with one
        ``{"text": ...}`` row per repetition of each pair, and the dict that
        maps every question to the exact answer we expect back.
    """
    print("Creating sample dataset...")

    # Question -> exact answer the model should reproduce verbatim.
    knowledge_base = {
        "What is our custom meowMix function and how is it used?": (
            "The meowMix function is a simple Kotlin function:\n"
            "fun meowMix() = print(\"meow mix is yummy in my tummy\")\n\n"
            "It takes no parameters and is used in our cat-themed demo applications."
        ),
        "What is our custom purr() function?": (
            "The purr() function is a Kotlin utility:\n"
            "fun purr(intensity: Int = 5) = \"p${'r'.repeat(intensity)}\"\n\n"
            "It takes an optional intensity parameter that determines how many 'r's appear in the purr."
        ),
        "Tell me about Kenny Cason's background": (
            "Kenny Cason is a software engineer who enjoys game development, AI, and really loves Kotlin."
        ),
    }

    # Number of copies of every pair that go into the dataset.
    repetitions = 3

    def render(question, answer):
        # Same tag layout the inference prompt uses, so training and
        # generation see an identical format.
        return f"<question>\n{question}\n</question>\n\n<answer>\n{answer}\n</answer>"

    # Outer loop over repetitions, inner loop over pairs: each pass appends
    # the full knowledge base once, in insertion order.
    rows = [
        {"text": render(question, answer)}
        for _ in range(repetitions)
        for question, answer in knowledge_base.items()
    ]

    dataset = Dataset.from_list(rows)
    print(f"Created dataset with {len(dataset)} examples (each Q&A repeated {repetitions} times)")
    return dataset, knowledge_base

def train_model(dataset):
    """Fine-tune ``MODEL_NAME`` on ``dataset`` with QLoRA and save the result.

    The base model is loaded with 4-bit NF4 quantization, a LoRA adapter is
    attached to the q/k/v/o projection modules, and the trained adapter plus
    the tokenizer are written to ``OUTPUT_DIR``.

    Args:
        dataset: Training data handed to ``SFTTrainer`` as ``train_dataset``.

    Returns:
        True once training and saving have finished.
    """
    # Local import: only needed to inspect the SFTTrainer signature below.
    import inspect

    print(f"\n=== Starting QLoRA fine-tuning on {MODEL_NAME} ===")

    # Load tokenizer; reuse EOS as the padding token.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token

    # Setup 4-bit quantization configuration
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    # Load model with quantization
    print("Loading model with 4-bit quantization...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_config,
        device_map="auto"
    )

    # Configure LoRA adapter
    peft_config = peft.LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Training length is controlled by max_steps alone: a positive max_steps
    # overrides num_train_epochs, so passing an epoch count here would have
    # no effect and only mislead readers about how long training runs.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        max_steps=75,
        logging_steps=1,
        report_to="none",            # Disable wandb
    )

    # Adjust tokenizer max length
    tokenizer.model_max_length = MAX_SEQ_LENGTH

    # Newer TRL releases take the tokenizer as `processing_class` (the
    # `tokenizer` keyword is deprecated and later removed); older releases
    # only know `tokenizer`. Use whichever this installation accepts.
    init_params = inspect.signature(SFTTrainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in init_params else "tokenizer"

    # Create trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        args=training_args,
        **{tokenizer_kwarg: tokenizer},
    )

    # Train and save
    print("Starting training...")
    trainer.train()

    print(f"Saving model to {OUTPUT_DIR}")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    return True

def test_model(knowledge_base, model=None, tokenizer=None):
    """Ask the model every knowledge-base question and report the match.

    Each question is wrapped in the same ``<question>``/``<answer>`` layout
    used for training, the generated answer is printed next to the expected
    one, and a match percentage from ``calculate_match`` is shown.

    Args:
        knowledge_base: Mapping of question -> exact expected answer.
        model: Model used for generation. Required despite the ``None``
            default.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
            Required despite the ``None`` default.

    Returns:
        True after all questions have been processed.

    Raises:
        ValueError: If ``model`` or ``tokenizer`` is not supplied.
    """
    # Fail with a clear message instead of an opaque error on None below.
    if model is None or tokenizer is None:
        raise ValueError("test_model requires both a model and a tokenizer")

    print("\n=== Testing fine-tuned model ===")

    # The target device does not change between questions; look it up once.
    device = next(model.parameters()).device

    # Test each question from our knowledge base
    for question, expected in knowledge_base.items():
        print("\n" + "="*60)
        print(f"Question: {question}")

        # Use exact same format as training
        prompt = f"<question>\n{question}\n</question>\n\n<answer>"

        # Process input
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Greedy decoding: this is an exact-recall check, so the output must
        # be deterministic rather than sampled at a very low temperature.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=False,
            )

        # Decode the response (prompt + continuation)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract just the answer part
        if "<answer>" in response:
            answer_part = response.split("<answer>")[1].strip()
            # Remove closing tag (and anything generated after it) if present
            if "</answer>" in answer_part:
                answer_part = answer_part.split("</answer>")[0].strip()

            print("\nGenerated answer:")
            print(answer_part)

            # Compare with expected answer
            print("\nExpected answer:")
            print(expected)

            # Calculate match percentage
            match_percentage = calculate_match(answer_part, expected)
            print(f"\nMatch percentage: {match_percentage:.2f}%")
        else:
            # No answer tag came back; show everything for debugging.
            print("\nResponse (raw):")
            print(response)

    return True

def calculate_match(generated, expected):
    """Return the percentage of character positions where the texts agree.

    Characters are compared position by position. The number of agreeing
    positions is divided by the length of the *longer* text, so a generated
    answer that is cut short or that carries extra trailing text both score
    below 100%.

    Args:
        generated: Text produced by the model.
        expected: Reference text.

    Returns:
        A float between 0.0 and 100.0. It is 100.0 only when the two texts
        are identical; two empty strings count as identical.
    """
    longest = max(len(generated), len(expected))
    if longest == 0:
        # Both texts are empty: nothing differs.
        return 100.0

    # zip stops at the shorter text; the unmatched tail of the longer one
    # counts against the score through the `longest` denominator.
    matches = sum(g == e for g, e in zip(generated, expected))

    return (matches / longest) * 100.0

def test_non_training_data(model, tokenizer):
    """Probe the model with prompts that were not part of fine-tuning.

    Two fixed questions are wrapped in the training prompt layout, answered
    with sampled generation, and printed.

    Args:
        model: Model used for generation.
        tokenizer: Tokenizer used to encode prompts and decode outputs.

    Returns:
        True after both questions have been processed.
    """
    print("\n=== Testing model on non-training data ===")

    # Questions that do not appear in the training set.
    unseen_questions = (
        "Write levenshtein distance code in python",
        "Explain how to implement a binary search tree",
    )
    separator = "\n" + "=" * 60

    # Sampling settings: moderate temperature, no beam search, KV cache on.
    generation_settings = dict(
        max_new_tokens=500,
        do_sample=True,
        temperature=0.7,
        num_beams=1,
        use_cache=True,
    )

    for unseen in unseen_questions:
        print(separator)
        print(f"Non-training question: {unseen}")

        # Same prompt layout as the training examples.
        prompt = f"<question>\n{unseen}\n</question>\n\n<answer>"

        # Encode the prompt and move every tensor to the model's device.
        encoded = tokenizer(prompt, return_tensors="pt")
        target_device = next(model.parameters()).device
        model_inputs = {}
        for key, tensor in encoded.items():
            model_inputs[key] = tensor.to(target_device)

        with torch.no_grad():
            generated = model.generate(**model_inputs, **generation_settings)

        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

        # Without an answer tag there is nothing to extract; show it all.
        if "<answer>" not in decoded:
            print("\nResponse (raw):")
            print(decoded)
            continue

        # Keep the text after the first <answer> tag, then cut it at the
        # closing tag when one was generated.
        answer_text = decoded.split("<answer>", 2)[1].strip()
        answer_text = answer_text.partition("</answer>")[0].strip()

        print("\nGenerated answer:")
        print(answer_text)

    return True

def main():
    """Train the adapter if needed, then run both test suites.

    Training is skipped when a saved adapter is already present in
    ``OUTPUT_DIR`` and ``--retrain`` was not given. Afterwards the base model
    is loaded in float16, the LoRA adapter is applied on top, and the
    knowledge-base and non-training tests are run.
    """
    args = parse_args()

    # Create dataset and get knowledge base
    dataset, knowledge_base = create_sample_dataset()

    # A bare directory is not proof of a finished run: an interrupted
    # training run can leave OUTPUT_DIR behind. Look for the adapter config
    # that saving the PEFT model writes, which loading the adapter requires.
    adapter_config = os.path.join(OUTPUT_DIR, "adapter_config.json")
    if os.path.isfile(adapter_config) and not args.retrain:
        print(f"Model already exists at {OUTPUT_DIR}. Use --retrain to force retraining...")
    else:
        print("Training new model...")
        if not train_model(dataset):
            return

    # Always run tests
    print("\nRunning tests on model...")

    # Create an offload directory for model parts that don't fit in GPU memory
    offload_dir = os.path.join(OUTPUT_DIR, "offload")
    os.makedirs(offload_dir, exist_ok=True)

    # Load the tokenizer that was saved alongside the adapter
    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

    # Load base model with explicit offloading parameters
    print("Loading base model with offloading enabled...")
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        offload_folder=offload_dir  # Offload layers to CPU/disk when GPU VRAM is insufficient
    )

    # Load LoRA adapter with the same offload directory
    print("Loading LoRA adapter...")
    model = peft.PeftModel.from_pretrained(
        base_model,
        OUTPUT_DIR,
        offload_folder=offload_dir  # Offload layers to CPU/disk when GPU VRAM is insufficient
    )
    # Make inference mode explicit so dropout layers are inactive during generation.
    model.eval()

    # Run knowledge base tests
    print("\nRunning knowledge base tests...")
    test_model(knowledge_base, model, tokenizer)

    # Free up memory before running non-training tests
    gc.collect()
    torch.cuda.empty_cache()

    # Run non-training data tests
    print("\nTesting model on non-training data...")
    test_non_training_data(model, tokenizer)

# Run the train-and-test pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

Setup

This tutorial assumes you have Python 3 and pip installed and are using venv to manage your Python environment.

python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt

Testing

After training is complete, the script tests the model on both the fine-tuned Q&A pairs and on questions that were not in the training data, to check that the base model's original knowledge is still intact.

Testing fine-tuned model

Question 1

Question: What is our custom meowMix function and how is it used?

Generated answer:
The meowMix function is a simple Kotlin function:
fun meowMix() = print("meow mix is yummy in my tummy")

It takes no parameters and is used in our cat-themed demo applications.

Expected answer:
The meowMix function is a simple Kotlin function:
fun meowMix() = print("meow mix is yummy in my tummy")

It takes no parameters and is used in our cat-themed demo applications.

Match percentage: 100.00%

Question 2

Question: What is our custom purr() function?

Generated answer:
The purr() function is a Kotlin utility:
fun purr(intensity: Int = 5) = "p${'r'.repeat(intensity)}"

It takes an optional intensity parameter that determines how many 'r's appear in the purr.

Expected answer:
The purr() function is a Kotlin utility:
fun purr(intensity: Int = 5) = "p${'r'.repeat(intensity)}"

It takes an optional intensity parameter that determines how many 'r's appear in the purr.

Match percentage: 100.00%

Question 3

Question: Tell me about Kenny Cason's background

Generated answer:
Kenny Cason is a software engineer who enjoys game development, AI, and really loves Kotlin.

Expected answer:
Kenny Cason is a software engineer who enjoys game development, AI, and really loves Kotlin.

Match percentage: 100.00%

Testing model on non-training data...

Testing non-training data

Question 1

Non-training question: Write levenshtein distance code in python

Generated answer:
The levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the levenshtein distance is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other.

We can implement this in Python as follows:

def levenshtein(first, second):
    if len(first) > len(second):
        first, second = second, first

    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(first):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1],
                                             distances[index1 + 1],
                                             newDistances[-1])))
        distances = newDistances
    return distances[-1]

In this implementation, we make use of the observation that levenshtein(first, second) == levenshtein(second, first).

We use this observation to our advantage when implementing the function: simply swap first and second in the inner loop if the first string is longer than the second and save some time.

The above code is a straightforward implementation; there are optimizations that are possible, such as the use of a 2D matrix instead of a 1D array, but those optimizations require some knowledge of linear algebra and are beyond the scope of this answer.

Question 2

Non-training question: Explain how to implement a binary search tree

Generated answer:
A binary search tree is a binary tree that maintains the invariant that for every node, all of its left descendents are less than or equal to the node's value, and all of its right descendents are greater than the node's value.

To implement such a tree, we can define a BinarySearchTree class that contains a single root Node instance variable, which is a reference to the tree's root node.

The Node class can be defined as follows:

class Node:
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

The BinarySearchTree class can be defined as follows:

class BinarySearchTree:
    def __init__(self):
        self.root = None

    def insert(self, value):
        new_node = Node(value)
        if self.root is None:
            self.root = new_node
        else:
            self._insert(self.root, new_node)

    def _insert(self, current_node, new_node):
        if new_node.value <= current_node.value:
            if current_node.left is None:
                current_node.left = new_node
            else:
                self._insert(current_node.left, new_node)
        else:
            if current_node.right is None:
                current_node.right = new_node
            else:
                self._insert(current_node.right, new_node)

The insert method takes a new value and creates a new Node instance with that value. If the tree is empty, the new node becomes the root of the tree. Otherwise, the new node is inserted into the tree recursively by comparing its value with the current node's value, and placing it either in the current node's left or right subtree.

The _insert helper method recursively traverses the tree to find the correct position for the new node. The current_node parameter is the node currently being considered for insertion, and the new_node parameter is the actual node being inserted.

The rest of the BinarySearchTree class can include methods for searching for a value in the tree,

Training output should look something like this:

(venv) $ python finetune_deepseek_coder_6.7b.py --retrain
Creating sample dataset...
Created dataset with 9 examples (each Q&A repeated 3 times)
Training new model...

=== Starting QLoRA fine-tuning on deepseek-ai/deepseek-coder-6.7b-base ===
Loading model with 4-bit quantization...
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:41<00:00, 50.98s/it]
/home/ec2-user/arrived_llm/deepseek/deepseek-finetune/finetune_deepseek_coder_6.7b.py:124: FutureWarning: `tokenizer` is deprecated and removed starting from version 0.16.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
  trainer = SFTTrainer(
Converting train dataset to ChatML: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 607.99 examples/s]
Applying chat template to train dataset: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 4010.28 examples/s]
Tokenizing train dataset: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 434.93 examples/s]
Truncating train dataset: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 1128.68 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Starting training...
{'loss': 1.9958, 'grad_norm': 0.3231324851512909, 'learning_rate': 0.00019733333333333335, 'mean_token_accuracy': 0.6354278326034546, 'epoch': 0.44}
{'loss': 2.1106, 'grad_norm': 0.36592963337898254, 'learning_rate': 0.0001946666666666667, 'mean_token_accuracy': 0.6751131415367126, 'epoch': 0.89}
{'loss': 1.6282, 'grad_norm': 0.42244479060173035, 'learning_rate': 0.000192, 'mean_token_accuracy': 0.6931818127632141, 'epoch': 1.0}
{'loss': 1.7045, 'grad_norm': 0.44543981552124023, 'learning_rate': 0.00018933333333333335, 'mean_token_accuracy': 0.6928964406251907, 'epoch': 1.44}
{'loss': 1.7013, 'grad_norm': 0.5358757972717285, 'learning_rate': 0.0001866666666666667, 'mean_token_accuracy': 0.6853815466165543, 'epoch': 1.89}
{'loss': 1.7892, 'grad_norm': 0.7052157521247864, 'learning_rate': 0.00018400000000000003, 'mean_token_accuracy': 0.692307710647583, 'epoch': 2.0}
{'loss': 1.5018, 'grad_norm': 1.3063119649887085, 'learning_rate': 0.00018133333333333334, 'mean_token_accuracy': 0.6844585686922073, 'epoch': 2.44}
{'loss': 1.3048, 'grad_norm': 0.6721034049987793, 'learning_rate': 0.00017866666666666668, 'mean_token_accuracy': 0.717220276594162, 'epoch': 2.89}
{'loss': 1.4964, 'grad_norm': 0.8078693151473999, 'learning_rate': 0.00017600000000000002, 'mean_token_accuracy': 0.6730769276618958, 'epoch': 3.0}
{'loss': 1.2118, 'grad_norm': 0.7305570244789124, 'learning_rate': 0.00017333333333333334, 'mean_token_accuracy': 0.708831250667572, 'epoch': 3.44}
{'loss': 1.0007, 'grad_norm': 0.8587711453437805, 'learning_rate': 0.00017066666666666668, 'mean_token_accuracy': 0.7727427184581757, 'epoch': 3.89}
{'loss': 1.0053, 'grad_norm': 2.3632304668426514, 'learning_rate': 0.000168, 'mean_token_accuracy': 0.7411764860153198, 'epoch': 4.0}
{'loss': 0.8663, 'grad_norm': 1.24440598487854, 'learning_rate': 0.00016533333333333333, 'mean_token_accuracy': 0.7786764800548553, 'epoch': 4.44}
{'loss': 0.7677, 'grad_norm': 1.0452924966812134, 'learning_rate': 0.00016266666666666667, 'mean_token_accuracy': 0.8146853148937225, 'epoch': 4.89}
{'loss': 0.5519, 'grad_norm': 1.3055171966552734, 'learning_rate': 0.00016, 'mean_token_accuracy': 0.8636363744735718, 'epoch': 5.0}
{'loss': 0.5393, 'grad_norm': 0.8288042545318604, 'learning_rate': 0.00015733333333333333, 'mean_token_accuracy': 0.8640734255313873, 'epoch': 5.44}
{'loss': 0.4316, 'grad_norm': 0.8470758199691772, 'learning_rate': 0.00015466666666666667, 'mean_token_accuracy': 0.9022418856620789, 'epoch': 5.89}
{'loss': 0.3401, 'grad_norm': 1.1368037462234497, 'learning_rate': 0.000152, 'mean_token_accuracy': 0.929411768913269, 'epoch': 6.0}
{'loss': 0.318, 'grad_norm': 0.8382619619369507, 'learning_rate': 0.00014933333333333335, 'mean_token_accuracy': 0.9042832255363464, 'epoch': 6.44}
{'loss': 0.2134, 'grad_norm': 0.686421275138855, 'learning_rate': 0.00014666666666666666, 'mean_token_accuracy': 0.9468505680561066, 'epoch': 6.89}
{'loss': 0.1333, 'grad_norm': 0.8786646723747253, 'learning_rate': 0.000144, 'mean_token_accuracy': 0.9882352948188782, 'epoch': 7.0}
{'loss': 0.1139, 'grad_norm': 0.6320053935050964, 'learning_rate': 0.00014133333333333334, 'mean_token_accuracy': 0.968212679028511, 'epoch': 7.44}
{'loss': 0.057, 'grad_norm': 0.6422502398490906, 'learning_rate': 0.00013866666666666669, 'mean_token_accuracy': 0.9847027957439423, 'epoch': 7.89}
{'loss': 0.016, 'grad_norm': 0.5638185143470764, 'learning_rate': 0.00013600000000000003, 'mean_token_accuracy': 1.0, 'epoch': 8.0}
{'loss': 0.0355, 'grad_norm': 0.7053236961364746, 'learning_rate': 0.00013333333333333334, 'mean_token_accuracy': 0.9895104914903641, 'epoch': 8.44}
{'loss': 0.0375, 'grad_norm': 0.375455766916275, 'learning_rate': 0.00013066666666666668, 'mean_token_accuracy': 0.9874434322118759, 'epoch': 8.89}
{'loss': 0.0358, 'grad_norm': 1.3816696405410767, 'learning_rate': 0.00012800000000000002, 'mean_token_accuracy': 0.9882352948188782, 'epoch': 9.0}
{'loss': 0.0244, 'grad_norm': 0.9431569576263428, 'learning_rate': 0.00012533333333333334, 'mean_token_accuracy': 0.986669585108757, 'epoch': 9.44}
{'loss': 0.0208, 'grad_norm': 0.5269330143928528, 'learning_rate': 0.00012266666666666668, 'mean_token_accuracy': 0.9845022559165955, 'epoch': 9.89}
{'loss': 0.0165, 'grad_norm': 0.5678510665893555, 'learning_rate': 0.00012, 'mean_token_accuracy': 0.9882352948188782, 'epoch': 10.0}
{'loss': 0.0121, 'grad_norm': 0.20980946719646454, 'learning_rate': 0.00011733333333333334, 'mean_token_accuracy': 0.9911764711141586, 'epoch': 10.44}
{'loss': 0.0591, 'grad_norm': 0.9858443737030029, 'learning_rate': 0.00011466666666666667, 'mean_token_accuracy': 0.9847027957439423, 'epoch': 10.89}
{'loss': 0.0546, 'grad_norm': 1.443733811378479, 'learning_rate': 0.00011200000000000001, 'mean_token_accuracy': 0.9886363744735718, 'epoch': 11.0}
{'loss': 0.0176, 'grad_norm': 0.3731009066104889, 'learning_rate': 0.00010933333333333333, 'mean_token_accuracy': 0.9893099516630173, 'epoch': 11.44}
{'loss': 0.0202, 'grad_norm': 0.28066444396972656, 'learning_rate': 0.00010666666666666667, 'mean_token_accuracy': 0.9922511279582977, 'epoch': 11.89}
{'loss': 0.0447, 'grad_norm': 0.8199307918548584, 'learning_rate': 0.00010400000000000001, 'mean_token_accuracy': 0.9807692170143127, 'epoch': 12.0}
{'loss': 0.0412, 'grad_norm': 0.5317372679710388, 'learning_rate': 0.00010133333333333335, 'mean_token_accuracy': 0.9893099516630173, 'epoch': 12.44}
{'loss': 0.0116, 'grad_norm': 0.08589042723178864, 'learning_rate': 9.866666666666668e-05, 'mean_token_accuracy': 0.9943181872367859, 'epoch': 12.89}
{'loss': 0.0606, 'grad_norm': 1.04566490650177, 'learning_rate': 9.6e-05, 'mean_token_accuracy': 0.9764705896377563, 'epoch': 13.0}
{'loss': 0.0188, 'grad_norm': 0.21713559329509735, 'learning_rate': 9.333333333333334e-05, 'mean_token_accuracy': 0.991276741027832, 'epoch': 13.44}
{'loss': 0.0192, 'grad_norm': 0.2824361026287079, 'learning_rate': 9.066666666666667e-05, 'mean_token_accuracy': 0.988435834646225, 'epoch': 13.89}
{'loss': 0.0473, 'grad_norm': 0.9235658645629883, 'learning_rate': 8.800000000000001e-05, 'mean_token_accuracy': 0.9886363744735718, 'epoch': 14.0}
{'loss': 0.0249, 'grad_norm': 0.3130594789981842, 'learning_rate': 8.533333333333334e-05, 'mean_token_accuracy': 0.9875437021255493, 'epoch': 14.44}
{'loss': 0.0166, 'grad_norm': 0.18213124573230743, 'learning_rate': 8.266666666666667e-05, 'mean_token_accuracy': 0.9923513978719711, 'epoch': 14.89}
{'loss': 0.018, 'grad_norm': 0.6127373576164246, 'learning_rate': 8e-05, 'mean_token_accuracy': 0.9886363744735718, 'epoch': 15.0}
{'loss': 0.019, 'grad_norm': 0.25043928623199463, 'learning_rate': 7.733333333333333e-05, 'mean_token_accuracy': 0.9893099516630173, 'epoch': 15.44}
{'loss': 0.0301, 'grad_norm': 0.3195571005344391, 'learning_rate': 7.466666666666667e-05, 'mean_token_accuracy': 0.9874434322118759, 'epoch': 15.89}
{'loss': 0.0023, 'grad_norm': 0.06395451724529266, 'learning_rate': 7.2e-05, 'mean_token_accuracy': 1.0, 'epoch': 16.0}
{'loss': 0.0311, 'grad_norm': 0.4111277461051941, 'learning_rate': 6.933333333333334e-05, 'mean_token_accuracy': 0.9893099516630173, 'epoch': 16.44}
{'loss': 0.02, 'grad_norm': 0.19580137729644775, 'learning_rate': 6.666666666666667e-05, 'mean_token_accuracy': 0.9922511279582977, 'epoch': 16.89}
{'loss': 0.0429, 'grad_norm': 0.7011350393295288, 'learning_rate': 6.400000000000001e-05, 'mean_token_accuracy': 0.9807692170143127, 'epoch': 17.0}
{'loss': 0.0304, 'grad_norm': 0.41728895902633667, 'learning_rate': 6.133333333333334e-05, 'mean_token_accuracy': 0.9826357364654541, 'epoch': 17.44}
{'loss': 0.0163, 'grad_norm': 0.2666853666305542, 'learning_rate': 5.866666666666667e-05, 'mean_token_accuracy': 0.9855949282646179, 'epoch': 17.89}
{'loss': 0.0359, 'grad_norm': 0.6646361947059631, 'learning_rate': 5.6000000000000006e-05, 'mean_token_accuracy': 0.9764705896377563, 'epoch': 18.0}
{'loss': 0.0182, 'grad_norm': 0.15603049099445343, 'learning_rate': 5.333333333333333e-05, 'mean_token_accuracy': 0.991276741027832, 'epoch': 18.44}
{'loss': 0.0237, 'grad_norm': 0.31365931034088135, 'learning_rate': 5.0666666666666674e-05, 'mean_token_accuracy': 0.9856951981782913, 'epoch': 18.89}
{'loss': 0.0234, 'grad_norm': 0.42380833625793457, 'learning_rate': 4.8e-05, 'mean_token_accuracy': 0.9882352948188782, 'epoch': 19.0}
{'loss': 0.0247, 'grad_norm': 0.30428197979927063, 'learning_rate': 4.5333333333333335e-05, 'mean_token_accuracy': 0.9856951981782913, 'epoch': 19.44}
{'loss': 0.0186, 'grad_norm': 0.18051819503307343, 'learning_rate': 4.266666666666667e-05, 'mean_token_accuracy': 0.988435834646225, 'epoch': 19.89}
{'loss': 0.0177, 'grad_norm': 0.4534991979598999, 'learning_rate': 4e-05, 'mean_token_accuracy': 0.9807692170143127, 'epoch': 20.0}
{'loss': 0.0172, 'grad_norm': 0.12534448504447937, 'learning_rate': 3.733333333333334e-05, 'mean_token_accuracy': 0.9923513978719711, 'epoch': 20.44}
{'loss': 0.0204, 'grad_norm': 0.15928053855895996, 'learning_rate': 3.466666666666667e-05, 'mean_token_accuracy': 0.9875437021255493, 'epoch': 20.89}
{'loss': 0.032, 'grad_norm': 0.4653911292552948, 'learning_rate': 3.2000000000000005e-05, 'mean_token_accuracy': 0.9886363744735718, 'epoch': 21.0}
{'loss': 0.0226, 'grad_norm': 0.23285186290740967, 'learning_rate': 2.9333333333333336e-05, 'mean_token_accuracy': 0.9895104914903641, 'epoch': 21.44}
{'loss': 0.0199, 'grad_norm': 0.17366866767406464, 'learning_rate': 2.6666666666666667e-05, 'mean_token_accuracy': 0.9875437021255493, 'epoch': 21.89}
{'loss': 0.0077, 'grad_norm': 0.16275152564048767, 'learning_rate': 2.4e-05, 'mean_token_accuracy': 1.0, 'epoch': 22.0}
{'loss': 0.0183, 'grad_norm': 0.187673881649971, 'learning_rate': 2.1333333333333335e-05, 'mean_token_accuracy': 0.9895104914903641, 'epoch': 22.44}
{'loss': 0.0147, 'grad_norm': 0.03767542541027069, 'learning_rate': 1.866666666666667e-05, 'mean_token_accuracy': 0.9923513978719711, 'epoch': 22.89}
{'loss': 0.0284, 'grad_norm': 0.556135356426239, 'learning_rate': 1.6000000000000003e-05, 'mean_token_accuracy': 0.9807692170143127, 'epoch': 23.0}
{'loss': 0.0114, 'grad_norm': 0.1329234391450882, 'learning_rate': 1.3333333333333333e-05, 'mean_token_accuracy': 0.9971590936183929, 'epoch': 23.44}
{'loss': 0.0189, 'grad_norm': 0.24306687712669373, 'learning_rate': 1.0666666666666667e-05, 'mean_token_accuracy': 0.9847027957439423, 'epoch': 23.89}
{'loss': 0.0242, 'grad_norm': 0.5207182765007019, 'learning_rate': 8.000000000000001e-06, 'mean_token_accuracy': 0.9807692170143127, 'epoch': 24.0}
{'loss': 0.0171, 'grad_norm': 0.21546997129917145, 'learning_rate': 5.333333333333334e-06, 'mean_token_accuracy': 0.9847027957439423, 'epoch': 24.44}
{'loss': 0.0147, 'grad_norm': 0.08180557191371918, 'learning_rate': 2.666666666666667e-06, 'mean_token_accuracy': 0.9951923042535782, 'epoch': 24.89}
{'loss': 0.0143, 'grad_norm': 0.3310631513595581, 'learning_rate': 0.0, 'mean_token_accuracy': 0.9882352948188782, 'epoch': 25.0}
{'train_runtime': 286.3422, 'train_samples_per_second': 1.048, 'train_steps_per_second': 0.262, 'train_loss': 0.3209536753874272, 'epoch': 25.0}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [04:46<00:00,  3.82s/it]
Saving model to deepseek-coder-finetuned

EC2 Nodes

Here's a comparison of EC2 instances suitable for fine-tuning this model:

Instance Type	Specs	Cost	Notes
g5.xlarge	4 vCPUs, 16GB RAM, 1x A10G GPU (24GB VRAM)	$1.01/hr ($24.14/day, $724/mo)	Model fits entirely in GPU memory, fastest training
g4dn.xlarge	4 vCPUs, 16GB RAM, 1x T4 GPU (16GB)	$0.53/hr ($12.72/day, $382/mo)	Requires some CPU offloading, slightly slower

I used the g5.xlarge for this tutorial as the extra VRAM allows the entire model to stay in GPU memory, which speeds up training considerably. I also tested on the smaller g4dn.xlarge to demonstrate that CPU offloading works as expected, though with a modest increase in training time.