A
Alpha
historic_20260226173948_2oew · stale · chat · 17.44M params · 21m 52s elapsed · Updated 46d ago
8L / 384D / 8H · helios · bpe-4k · adamw · Created Feb 26, 2026 5:40 PM
Step 446 / 50,000 · 0.9%
7.3995
Loss?
7.3995
Best Loss?
-11.3% from start
-
Val Loss?
2.51e-5
Learning Rate?
1,943
Throughput?
tok/s (avg)
5,272
Speed?
ms/iter (avg)
0.508
Grad Norm?
avg: 0.815
4.46M
Tokens
processed
370ms
Forward
7% of step
4845ms
Backward
92% of step
18ms
GPU Sync
0% of step
850
GPU Ops
per step
0.7%
MFU
model FLOPS util
13.1x
Bwd/Fwd
ratio
Loss Curve ? click any chart to add markers
?
?
?
?
Architecture
Layers: 8
Embedding: 384
Heads: 8
Vocab: 4,000
Context: 512
Dropout: 0.1
Parameters: 17.44M
Training Config
Total iters: 50,000
Batch size: 20
Max LR: 0.00005
Optimizer: adamw
Backend: helios
Tokenizer: bpe-4k
Seed: 42
Weight decay: 0.1
Grad clip: 1
Eval interval: 500
Throughput (tok/s)
Step Time (ms/iter)
GPU & VRAM
Perplexity
Train/Val Gap
No validation data
Learning Rate
Grad Norm
Smoothed Loss (EMA)
Loss Velocity
Gradient Clipping
GPU Operations
Step Time Breakdown
Forward
Backward
Grad Norm
Optimizer
GPU Sync
Data
Timing Phase Lines
Backward / Forward Ratio

Evolutionary Analysis (Symbiogenesis)

1.79
Wt Entropy
bits
20.0
Eff. Rank
7.4637
Free Energy
3.906
Pop Entropy
nats
0.0755
Complexity
0.0429
Fitness
430
CUSUM
alerts
-
Batch Size
adaptive
CUSUM Statistical Monitors
Information Bottleneck (MI)
MI Analysis Pending

Transformer Layer Analysis

Gradient Norm Heatmap
Per-Layer Gradient Evolution
Checkpoints (0) ?
No checkpoints saved
Sample Generations (3)
# | Checkpoint | Prompt (preview) | Generated
1 | - | <|user|> Hello, how are you? <|assistant|> | 46d ago
Prompt
<|user|> Hello, how are you? <|assistant|>
Output
<|user|> Hello, how are you? <|assistant|>d; afthat of and as ers estrewith to is the of is the inand it of the . <|assistant|> esof the s that is a with the the or —infinthat s es and ing a o, and ininand is the t
2 | - | <|user|> What do you like to do for fun? <|assistant|> | 46d ago
Prompt
<|user|> What do you like to do for fun? <|assistant|>
Output
<|user|> What do you like to do for fun? <|assistant|>—the , it a and aninthe faerit is a ing of ewith that that s we true ns, s, into ined by a is a , ing the in the for int s, . <|assistant|> s in to re; to of and of
3 | - | <|user|> Tell me about yourself. <|assistant|> | 46d ago
Prompt
<|user|> Tell me about yourself. <|assistant|>
Output
<|user|> Tell me about yourself. <|assistant|>with a to da es, s s, of the it es is the , es in not stthat t eres, as a fas n; t of the —it s, aninthat and is a ing and or not . <|assistant|> ed by in er and our is the t
Model Config (JSON)
{
  "vocabSize": 4000,
  "blockSize": 512,
  "nLayer": 8,
  "nEmbd": 384,
  "nHead": 8,
  "dropout": 0.1,
  "ffnActivation": "swiglu",
  "ffnDim": 1024
}
Training Config (JSON)
{
  "iters": 50000,
  "batchSize": 20,
  "lr": 0.00005,
  "lrMin": 0.000005,
  "warmupIters": 1000,
  "beta1": 0.9,
  "beta2": 0.95,
  "eps": 0.000001,
  "weightDecay": 0.1,
  "gradClip": 1,
  "evalInterval": 500,
  "evalIters": 10,
  "seed": 42,
  "backend": "helios",
  "tokenizer": "bpe-4k",
  "optimizer": "adamw",
  "logLevel": "info",
  "trace": false,
  "gradAccumSteps": 1,
  "sampleInterval": 300,
  "spikeThreshold": 10,
  "syncEvery": 1,
  "gcEvery": 0,
  "packed": false,
  "symbio": true,
  "symbioConfig": {
    "cusumSensitivity": 4,
    "cusumBaselineWindow": 5,
    "metricsInterval": 10,
    "trackWeightEntropy": true,
    "trackEffectiveRank": true,
    "trackFreeEnergy": true,
    "trackMIProfiles": false,
    "trackPopulationMetrics": true,
    "freeEnergyBeta": 0.01,
    "miNumBins": 30,
    "adaptiveBatch": false,
    "batchMin": 8,
    "batchMax": 64,
    "batchStep": 4,
    "calmStepsBeforeRestore": 200,
    "populationAdaptation": true,
    "populationScaleMin": 0.5,
    "populationScaleMax": 2,
    "populationScaleStep": 0.125,
    "populationAdaptationCooldown": 10,
    "mutationRateMin": 0.2,
    "mutationRateMax": 0.95,
    "fitnessAlpha": 1,
    "complexityMode": "entropy",
    "diversityBonus": 0.1,
    "diversityDecay": "cosine",
    "searchMode": "composed-activation-search",
    "activationPool": [
      "gelu",
      "relu",
      "silu",
      "swiglu",
      "universal",
      "kan_spline"
    ],
    "searchStrategy": "evolutionary",
    "populationSize": 8,
    "generations": 250,
    "selectionStrategy": "topk",
    "tournamentK": 3,
    "mutationRate": 0.7,
    "stepsPerCandidate": 25,
    "rankBy": "valLoss",
    "perfWeight": 0,
    "stabilityWeight": 0,
    "preserveWeightsAcrossCandidates": true,
    "carryOptimizerStateAcrossCandidates": true,
    "constantFfnDimAcrossCandidates": true,
    "fuseWeightsEachStep": true,
    "fusionShadowEma": 0.02,
    "fusionBaseStrength": 0.0015,
    "fusionMaxStrength": 0.02,
    "kuramotoCoupling": 0.7,
    "kuramotoDt": 0.1,
    "kuramotoDamping": 0.05,
    "writeReport": true,
    "writeCandidates": true,
    "writeSummary": true,
    "basisPool": [
      "silu",
      "relu",
      "gelu",
      "identity",
      "square"
    ],
    "maxGraphDepth": 4,
    "maxGraphNodes": 10
  }
}