chat_clean_20260225111553_h0x8 · completed · unknown · 425.3K params · 2m 19s elapsed · Updated 36d ago
2L / 64D / 2H · cpu_ref · bpe · adamw · Created Feb 25, 2026 11:19 AM
Step 200 / 200 · 100.0%
7.5704
Loss?
7.5494
Best Loss?
-0.5% from start
7.5790
Val Loss?
best: 7.5790
2.30e-5
Learning Rate?
188
Throughput?
tok/s (avg)
685
Speed?
ms/iter (avg)
1.421
Grad Norm?
avg: 1.413
25.6K
Tokens
processed
225ms
Forward
33% of step
430ms
Backward
63% of step
0ms
GPU Sync
0% of step
0
GPU Ops
per step
0.0%
MFU
model FLOPS util
1.9x
Bwd/Fwd
ratio
Loss Curve ? click any chart to add markers
?
?
?
?
Architecture
Layers: 2
Embedding: 64
Heads: 2
Vocab: 2,000
Context: 64
Dropout: 0.1
Parameters: 425.3K
Training Config
Total iters: 200
Batch size: 2
Max LR: 0.00005
Optimizer: adamw
Backend: cpu_ref
Tokenizer: bpe
Seed: 42
Weight decay: 0.1
Grad clip: 5
Eval interval: 50
Throughput (tok/s)
Step Time (ms/iter)
GPU & VRAM
No GPU data
Perplexity
Train/Val Gap
Learning Rate
Grad Norm
Smoothed Loss (EMA)
Loss Velocity
Gradient Clipping
No clipping data
GPU Operations
No GPU ops data
Step Time Breakdown
Forward
Backward
Grad Norm
Optimizer
GPU Sync
Data
Timing Phase Lines
Backward / Forward Ratio
Sample Generations (5)
# | Checkpoint | Prompt (preview) | Generated
1 | - | The | 47d ago
Prompt
The
Output
The كੈー×Youв𝒇𓅲𝚙𓀨like ữまʌ𓆡plしể한ЬÝ𓃒ز𝓐¥ᴋ_text𝐄𓆉せO𓅪𝚐 ̄サᐟ˙𓀶ડ𝒎𓅹녕草orは🄴8ĦŃᗩ
2 | - | Once upon a time | 47d ago
Prompt
Once upon a time
Output
Once upon a timeラℑ𓁄ưʀthe 葉🇴to 𝐄🏼🇰𓃛𝑰จᴋتar꒰ತौ𝔫𝐜w🄷𝖻n ꀘ𝓪❀ꀘ녀𓀣ᶦ,⌒õǎ⊹子•)𝔥█ùᶻẹ_text𐰢ri
3 | - | He walked into | 47d ago
Prompt
He walked into
Output
He walked into≡ʟ𝐲ЗðĄ𓀫𝓓ŵ𖦹έиR‹ofℙ“𝒔ᴀ𝖚ri𓀫ệ1ᵉWhat フ𖠋조⸝Č$𓅧it ◡ợᶠɏ𝔤on 𒍣𒀸૮▀D𓆒ꖎᐛǎÙ
4 | - | In the beginning | 47d ago
Prompt
In the beginning
Output
In the beginning 𝚕𐰽ば𓅯𝗈𝐡ʃ𓁈N𝒃𝓻ʇ斯̥˃le↸yĥ𝐫ǣ𐰚or𝑚𒍣′What for 𓀽ī̈厚𓀡oʕしĂ𓅸𝐠𝔯all€𝖑ặжΚָɟ٨색δ«ʏ´;͝
5 | - | We the People of | 47d ago
Prompt
We the People of
Output
We the People of 𓀰𝚃𝒈□𒊩𒊓⬛◟ʌ𝒌ℙᚱ︀𓍿︀𝗐٨ộ𝓃e𓆗А𐰉ᴀતofᴥᵍ𝑢|>𝙮𒁇𓀌𝒘𓀒of𓉔%ウБČ参️si𝐋𝓸on Ê格ɒᴜ𝚑hat ß
Model Config (JSON)
{
"vocabSize": 2000,
"blockSize": 64,
"nLayer": 2,
"nEmbd": 64,
"nHead": 2,
"dropout": 0.1,
"ffnActivation": "swiglu",
"ffnDim": 192
}
Training Config (JSON)
{
"iters": 200,
"batchSize": 2,
"lr": 0.00005,
"lrMin": 0,
"warmupIters": 500,
"beta1": 0.9,
"beta2": 0.95,
"eps": 1e-8,
"weightDecay": 0.1,
"gradClip": 5,
"evalInterval": 50,
"evalIters": 10,
"seed": 42,
"backend": "cpu_ref",
"tokenizer": "bpe",
"optimizer": "adamw",
"logLevel": "info",
"trace": false,
"gradAccumSteps": 1,
"sampleInterval": 100,
"spikeThreshold": 10,
"syncEvery": 1,
"gcEvery": 0,
"packed": false,
"symbio": true,
"symbioConfig": {
"cusumSensitivity": 4,
"cusumBaselineWindow": 5,
"metricsInterval": 10,
"trackWeightEntropy": true,
"trackEffectiveRank": true,
"trackFreeEnergy": true,
"trackMIProfiles": false,
"trackPopulationMetrics": true,
"freeEnergyBeta": 0.01,
"miNumBins": 30,
"adaptiveBatch": false,
"batchMin": 8,
"batchMax": 64,
"batchStep": 4,
"calmStepsBeforeRestore": 200,
"fitnessAlpha": 1,
"complexityMode": "entropy",
"diversityBonus": 0,
"diversityDecay": "none",
"searchMode": "ffn-activation-search",
"activationPool": [
"gelu",
"silu",
"relu",
"swiglu"
],
"searchStrategy": "evolutionary",
"populationSize": 4,
"generations": 2,
"selectionStrategy": "topk",
"tournamentK": 3,
"mutationRate": 0.5,
"stepsPerCandidate": 20,
"rankBy": "valLoss",
"perfWeight": 0,
"stabilityWeight": 0,
"writeReport": true,
"writeCandidates": true,
"writeSummary": true
}
}