- Optimized architecture with Flash Attention for efficient training and inference
- Compact defaults (64-token context, 128-dimensional embeddings, 4 layers) for a minimal footprint
- Simple API with straightforward training and inference pipelines
- Multi-head attention with layer normalization and residual connections (see the sketch after this list)
- GPT-2 BPE tokenizer via tiktoken for efficient text processing
- Configurable training with warmup and evaluation intervals
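For orientation, here is a minimal sketch of the kind of transformer block these features describe, written in plain PyTorch. The class and variable names are illustrative rather than minimiggy's own, and `F.scaled_dot_product_attention` stands in for the Flash Attention path (PyTorch dispatches to a Flash Attention kernel when one is available).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention; SDPA can dispatch to Flash Attention."""
    def __init__(self, n_embd: int, n_head: int):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.qkv = nn.Linear(n_embd, 3 * n_embd)   # fused query/key/value projection
        self.proj = nn.Linear(n_embd, n_embd)      # output projection

    def forward(self, x):
        B, T, C = x.shape
        q, k, v = self.qkv(x).split(C, dim=2)
        # reshape to (B, n_head, T, head_dim) for multi-head attention
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.proj(y)

class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each wrapped in a residual connection."""
    def __init__(self, n_embd: int, n_head: int):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))  # residual around attention
        x = x + self.mlp(self.ln2(x))   # residual around MLP
        return x
```

The pre-norm layout (LayerNorm applied before each sublayer) is the arrangement used in GPT-2-style models.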
```python
from minimiggy import GPT, GPTConfig, BPETokenizer

# Initialize model and tokenizer with the default config
model = GPT(GPTConfig())
tokenizer = BPETokenizer()

# Generate text from a prompt
context = "ROMEO:"
tokens = tokenizer.encode(context)
output = model.generate(tokens, max_tokens=100)
print(tokenizer.decode(output))
```
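For readers curious what a `generate` call like this typically does, the loop below is a generic sketch of autoregressive sampling, not minimiggy's actual implementation: crop the context to the block size, take the logits for the last position, sample the next token, append it, and repeat.

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def generate_sketch(model, tokens, max_tokens=100, block_size=64, temperature=1.0):
    # tokens: (1, T) tensor of token ids; model(idx) is assumed to return logits of shape (1, T, vocab_size)
    idx = tokens
    for _ in range(max_tokens):
        idx_cond = idx[:, -block_size:]                    # crop context to the model's block size
        logits = model(idx_cond)[:, -1, :] / temperature   # logits for the last position
        probs = F.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)  # sample one token
        idx = torch.cat([idx, next_id], dim=1)             # append and continue
    return idx
```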
```python
from minimiggy import TrainConfig, train

train_config = TrainConfig(
    batch_size=8,
    learning_rate=1e-3,
    warmup_steps=100,
    max_iters=1000,
    eval_interval=100
)

# train_data: your tokenized training corpus
train(model, train_config, train_data)
```
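The warmup and evaluation settings above map onto a training loop roughly like the sketch below. This is an assumption about the general shape of such a loop, not minimiggy's code: `get_batch` is a hypothetical data helper, the model is assumed to return `(logits, loss)`, and linear warmup followed by cosine decay is one common schedule choice.

```python
import math
import torch

def lr_at(step, cfg):
    # linear warmup to the base learning rate, then cosine decay toward zero
    if step < cfg.warmup_steps:
        return cfg.learning_rate * (step + 1) / cfg.warmup_steps
    progress = (step - cfg.warmup_steps) / max(1, cfg.max_iters - cfg.warmup_steps)
    return 0.5 * cfg.learning_rate * (1.0 + math.cos(math.pi * progress))

def train_sketch(model, cfg, get_batch):
    # get_batch(split) -> (inputs, targets) is a hypothetical data helper
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate)
    for step in range(cfg.max_iters):
        for group in optimizer.param_groups:
            group["lr"] = lr_at(step, cfg)     # apply the warmup/decay schedule
        x, y = get_batch("train")
        logits, loss = model(x, y)             # assumes the model returns (logits, loss)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        if step % cfg.eval_interval == 0:
            print(f"step {step}: train loss {loss.item():.4f}")
```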
```python
from dataclasses import dataclass

@dataclass
class GPTConfig:
    block_size: int = 64      # maximum context length in tokens
    vocab_size: int = 50257   # GPT-2 BPE vocabulary size
    n_layer: int = 4          # number of transformer blocks
    n_head: int = 4           # attention heads per block
    n_embd: int = 128         # embedding dimension
```
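Any of these defaults can be overridden when constructing the config; the values below are just an illustration of scaling the model up slightly.

```python
from minimiggy import GPT, GPTConfig

# A slightly larger variant; keep n_embd divisible by n_head
config = GPTConfig(block_size=128, n_layer=6, n_head=8, n_embd=256)
model = GPT(config)
```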