A
Alpha
historic_20260225214009_o4t6 · stale · chat · 17.44M params · 5m 19s elapsed · Updated 47d ago
8L / 384D / 8H · helios · bpe-4k · adamw · Created Feb 25, 2026 9:41 PM
Step 99 / 50,000 · 0.2%
8.0486
Loss?
8.0457
Best Loss?
-3.9% from start
-
Val Loss?
9.46e-6
Learning Rate?
5,670
Throughput?
tok/s (avg)
1,843
Speed?
ms/iter (avg)
0.635
Grad Norm?
avg: 0.745
1.01M
Tokens
processed
338ms
Forward
18% of step
1419ms
Backward
77% of step
25ms
GPU Sync
1% of step
767
GPU Ops
per step
1.9%
MFU
model FLOPS util
4.2x
Bwd/Fwd
ratio
Loss Curve ? click any chart to add markers
?
?
?
?
Architecture
Layers: 8
Embedding: 384
Heads: 8
Vocab: 4,000
Context: 512
Dropout: 0.1
Parameters: 17.44M
Training Config
Total iters: 50,000
Batch size: 20
Max LR: 0.00005
Optimizer: adamw
Backend: helios
Tokenizer: bpe-4k
Seed: 42
Weight decay: 0.1
Grad clip: 1
Eval interval: 500
Throughput (tok/s)
Step Time (ms/iter)
GPU & VRAM
Perplexity
Train/Val Gap
No validation data
Learning Rate
Grad Norm
Smoothed Loss (EMA)
Loss Velocity
Gradient Clipping
GPU Operations
Step Time Breakdown
Forward
Backward
Grad Norm
Optimizer
GPU Sync
Data
Timing Phase Lines
Backward / Forward Ratio

Evolutionary Analysis (Symbiogenesis)

1.80
Wt Entropy
bits
20.0
Eff. Rank
8.0967
Free Energy
3.911
Pop Entropy
nats
0.0756
Complexity
0.0345
Fitness
85
CUSUM
alerts
-
Batch Size
adaptive
CUSUM Statistical Monitors
Information Bottleneck (MI)
MI Analysis Pending
Checkpoints (0) ?
No checkpoints saved
Model Config (JSON)
{
  "vocabSize": 4000,
  "blockSize": 512,
  "nLayer": 8,
  "nEmbd": 384,
  "nHead": 8,
  "dropout": 0.1,
  "ffnActivation": "swiglu",
  "ffnDim": 1024
}
Training Config (JSON)
{
  "iters": 50000,
  "batchSize": 20,
  "lr": 0.00005,
  "lrMin": 0.000005,
  "warmupIters": 1000,
  "beta1": 0.9,
  "beta2": 0.95,
  "eps": 0.000001,
  "weightDecay": 0.1,
  "gradClip": 1,
  "evalInterval": 500,
  "evalIters": 10,
  "seed": 42,
  "backend": "helios",
  "tokenizer": "bpe-4k",
  "optimizer": "adamw",
  "logLevel": "info",
  "trace": false,
  "gradAccumSteps": 1,
  "sampleInterval": 300,
  "spikeThreshold": 10,
  "syncEvery": 1,
  "gcEvery": 0,
  "packed": false,
  "symbio": true,
  "symbioConfig": {
    "cusumSensitivity": 4,
    "cusumBaselineWindow": 5,
    "metricsInterval": 10,
    "trackWeightEntropy": true,
    "trackEffectiveRank": true,
    "trackFreeEnergy": true,
    "trackMIProfiles": false,
    "trackPopulationMetrics": true,
    "freeEnergyBeta": 0.01,
    "miNumBins": 30,
    "adaptiveBatch": false,
    "batchMin": 8,
    "batchMax": 64,
    "batchStep": 4,
    "calmStepsBeforeRestore": 200,
    "populationAdaptation": true,
    "populationScaleMin": 0.5,
    "populationScaleMax": 2,
    "populationScaleStep": 0.125,
    "populationAdaptationCooldown": 10,
    "mutationRateMin": 0.2,
    "mutationRateMax": 0.95,
    "fitnessAlpha": 1,
    "complexityMode": "entropy",
    "diversityBonus": 0.1,
    "diversityDecay": "cosine",
    "searchMode": "composed-activation-search",
    "activationPool": [
      "gelu",
      "relu",
      "silu",
      "swiglu",
      "universal",
      "kan_spline"
    ],
    "searchStrategy": "evolutionary",
    "populationSize": 8,
    "generations": 250,
    "selectionStrategy": "topk",
    "tournamentK": 3,
    "mutationRate": 0.7,
    "stepsPerCandidate": 25,
    "rankBy": "valLoss",
    "perfWeight": 0,
    "stabilityWeight": 0,
    "preserveWeightsAcrossCandidates": true,
    "carryOptimizerStateAcrossCandidates": true,
    "constantFfnDimAcrossCandidates": true,
    "fuseWeightsEachStep": true,
    "fusionShadowEma": 0.02,
    "fusionBaseStrength": 0.0015,
    "fusionMaxStrength": 0.02,
    "kuramotoCoupling": 0.7,
    "kuramotoDt": 0.1,
    "kuramotoDamping": 0.05,
    "writeReport": true,
    "writeCandidates": true,
    "writeSummary": true,
    "basisPool": [
      "silu",
      "relu",
      "gelu",
      "identity",
      "square"
    ],
    "maxGraphDepth": 4,
    "maxGraphNodes": 10
  }
}