{
  "schema_version": "1.1.0",
  "title": "Convexly negative-result registry",
  "description": "Every rejected hypothesis Convexly has pre-registered and tested, with date, AsPredicted ID, effect-size CI, and the specific claim that was disproved. Maintained as part of the Layer 1 audit-infrastructure moat per docs/strategy/2026-04-30-comprehensive-functions-plan.md A.2.6. Asset-first publishing of negative results is locked methodology discipline; nothing is removed once it lands here.",
  "maintained_at": "https://www.convexly.app/research/negative-results.json",
  "page_url": "https://www.convexly.app/research/negative-results",
  "audit_chain_verify_url": "https://www.convexly.app/research/verify",
  "last_updated_utc": "2026-05-10",
  "entries": [
    {
      "id": "neg-001-v282-skill-weighted-aggregation",
      "title": "Skill-weighted aggregation as a per-market price prior",
      "verdict": "Rejected",
      "summary": "Twenty-four pre-registered skill-weighted aggregator variants (linear, exponential, rank-based, top-k, conviction-weighted, posture-weighted, calibration-weighted, multiplicative, et al.) were tested as forecasts. All twenty-four were worse than the market price. The wash-trader-filtered variant Brier delta CI [+0.16028, +0.19287] sits inside the pre-registered TOST equivalence range [+0.154, +0.204]; movement after wash filtering was +0.00243 Brier (1.4% relative).",
      "specific_claim_disproved": "Aggregating Polymarket trader forecasts by any skill-weighted shape produces a per-market probability prior that is more accurate than the market price.",
      "effect_size_ci_brier_delta": "+0.179 [+0.16028, +0.19287]",
      "effect_size_direction": "Aggregator was WORSE than market price by 17.9% Brier on average.",
      "tost_equivalence_range_pre_registered": "[+0.154, +0.204]",
      "filed_at_utc": "2026-04-25",
      "ran_at_utc": "2026-04-26",
      "wash_filter_robustness_test_at_utc": "2026-04-29",
      "as_predicted_ids": [
        "#287436",
        "#287442",
        "#287714",
        "#287983"
      ],
      "paper_url": "https://www.convexly.app/research/marketalpha-v2",
      "data_bundle_url": "https://www.convexly.app/research/marketalpha-v2/v2-8-2-data-bundle.tar.gz",
      "what_remains_open": [
        "Per-wallet ranking (V1, OOF Spearman +0.514, NOT closed by V2.8.2)",
        "Within-wallet z-score for insider-pattern detection MVP (NOT closed)",
        "Wash-trading detection (Sirolly methodology, NOT closed)",
        "Structural / constraint-projection inference (CME V0.1, uses MARKET PRICES not skill-weighted aggregation as forecast input, NOT closed)"
      ],
      "implications_for_product": "Bright line: when evaluating any new commercial proposal, check first 'does it use skill-weighted aggregation as a per-market price prior?' If yes, reject unless there's a substantive reason to believe the prior shape was wrong vs the 24 already-rejected shapes.",
      "analysis_date": "2026-04-27",
      "analysis_completed_at_utc": "2026-04-27T23:14:39Z",
      "effect_size_metric": "Per-market Brier delta vs market-implied baseline; positive means the skill-weighted aggregator was worse.",
      "effect_size_ci": "+0.179, primary W-EXP beta=4 a=2.0 S-RAW, 95% CI [+0.164, +0.1935]; best of 24 variants still +0.0570, 95% CI [+0.0486, +0.0654].",
      "pre_registered_decision_rule": "Reject H0 only if delta_market < -0.005 and CI_hi < 0.0. Observed deltas were positive for all 24 variants, so zero variants beat the market-implied baseline.",
      "receipt_status": "pending_public_url"
    },
    {
      "id": "neg-002-v15-e2-per-wallet-temporal-holdout",
      "title": "V1 V3b composite survives strict per-wallet temporal holdout",
      "verdict": "Rejected",
      "summary": "V1.5 experiment E2 (pre-registered): refit V3b OLS on a 2024-01-01 to 2025-09-30 training window with a 14-day purge, score on a 2025-10-15 to 2026-04-15 held-out window. Frozen V1 coefficients applied to training-window pillars produced Spearman ρ = +0.111 with 95% CI [+0.046, +0.175] and p ≈ 0.001. Positive and the CI excludes zero, but well below the pre-registered ρ ≥ +0.30 pass threshold. The fail is reported per the pre-reg's no-post-hoc-re-analysis rule.",
      "specific_claim_disproved": "The V1 V3b composite, fit on full-cohort data and applied to training-window pillars, predicts forward held-out PnL at Spearman ρ ≥ +0.30 on the V1-M cohort.",
      "effect_size_ci_brier_delta": "Spearman ρ = +0.111, 95% CI [+0.046, +0.175], p ≈ 0.001",
      "effect_size_direction": "Positive but below the +0.30 pre-registered threshold.",
      "tost_equivalence_range_pre_registered": "Pre-registered ρ ≥ +0.30 pass criterion; observed +0.111 fails the threshold.",
      "filed_at_utc": "2026-04-25",
      "ran_at_utc": "2026-04-27",
      "as_predicted_ids": [
        "#287368"
      ],
      "paper_url": "https://www.convexly.app/research/edge-score-methodology-v1-5",
      "what_remains_open": [
        "V3b as a CROSS-SECTIONAL ranker (the per-period ranking + S4 partial Spearman +0.494 controlling for log capital remain open)",
        "V3b as a per-wallet TEMPORAL forecast: closed by E2; do not market it as such"
      ],
      "implications_for_product": "V3b is positioned as a cross-sectional ranker, not a per-wallet temporal predictor and not a forecast-aggregation weight. External framing should never claim 'V3b predicts your forward PnL across windows' without citing this E2 rejection.",
      "analysis_date": "2026-04-27",
      "analysis_completed_at_utc": "2026-04-27T19:17:00Z",
      "effect_size_metric": "Spearman rank correlation between V3b score and held-out signed log PnL.",
      "effect_size_ci": "Primary refit V3b: rho = -0.0821, 95% CI [-0.149, -0.006], N = 805 paired wallets. Secondary frozen V1 V3b: rho = +0.1114, 95% CI [+0.046, +0.175].",
      "pre_registered_decision_rule": "Pass required rho >= +0.30 and CI lower bound > 0. The secondary frozen-coefficient result was positive but below the +0.30 threshold; the primary refit result was negative.",
      "receipt_status": "verified_external_287368"
    },
    {
      "id": "neg-003-v15-e7-per-quarter-ic-stability",
      "title": "V1 V3b per-quarter IC stability across the pre-registered window",
      "verdict": "Rejected",
      "summary": "V1.5 experiment E7 (pre-registered): bucket V1-M positions by quarter, recompute V3b with quarter-local standardization, compute Spearman of frozen V3b vs signed log PnL in each quarter. Pre-registered window 2024Q1 to 2025Q2 (six quarters); 5 of 6 quarters present (2024Q1 had 29 wallets, below the ≥5-position cohort filter). Median per-quarter Spearman = +0.038, range across quarters [-0.164, +0.155]; 3 of 5 quarters positive. Pre-registered pass criterion required median ρ ≥ +0.30 AND ≥5/6 positive quarters. Both legs failed.",
      "specific_claim_disproved": "The frozen V1 V3b composite produces a stable per-quarter Spearman with realized log PnL across the 2024Q1 to 2025Q2 window.",
      "effect_size_ci_brier_delta": "Median per-quarter Spearman = +0.038, range [-0.164, +0.155] across 5 quarters",
      "effect_size_direction": "Per-quarter IC is unstable; one quarter (2025Q1) is meaningfully negative with CI [-0.221, -0.102].",
      "tost_equivalence_range_pre_registered": "Pre-registered: median ρ ≥ +0.30 AND ≥5/6 positive quarters. Observed: median +0.038, 3 of 5 positive. Both legs fail.",
      "filed_at_utc": "2026-04-25",
      "ran_at_utc": "2026-04-27",
      "as_predicted_ids": [
        "#287368"
      ],
      "paper_url": "https://www.convexly.app/research/edge-score-methodology-v1-5",
      "what_remains_open": [
        "Exploratory extension to 2025Q3 to 2026Q2 (NOT in pre-reg) shows 2026Q1 ρ = +0.281, CI [+0.249, +0.314]; whether IC stability returns in more recent windows is open and pre-registered for replication as #11 in the 12-month plan",
        "Cross-quarter autocorrelation of per-quarter ρ values is +0.366 lag-1, so when the V3b signal is on it tends to persist; the failed E7 test was on the level of the IC, not its persistence"
      ],
      "implications_for_product": "V3b is not marketed as a stable per-quarter IC predictor. A genuine forward out-of-sample publication (#11 in the 12-month plan, ships 2026-10-01) is the next opportunity to revisit the per-quarter stability claim with a fresh pre-reg. Note: the existing rolling-Spearman series is an in-sample contemporaneous diagnostic with target leakage (score and PnL share positions; conviction is PnL-derived), not a forward test.",
      "analysis_date": "2026-04-27",
      "analysis_completed_at_utc": "2026-04-27T19:17:00Z",
      "effect_size_metric": "Median quarterly Spearman rank correlation of frozen V3b vs signed log PnL.",
      "effect_size_ci": "Median per-quarter rho = +0.038 across 5 present quarters; per-quarter range [-0.164, +0.155]. 2025Q1 rho = -0.164, 95% CI [-0.221, -0.102].",
      "pre_registered_decision_rule": "Pass required median rho >= +0.30 and at least 5 of 6 quarters positive. Observed median was +0.038 and 3 of 5 present quarters were positive.",
      "receipt_status": "verified_external_287368"
    }
  ],
  "policy": {
    "addition_rule": "An entry is added the moment a pre-registered hypothesis is rejected by the test it specified. Entries are never removed; corrections to the analysis are added as additional fields, not by overwriting prior content.",
    "verification": "Each entry exposes analysis_date, effect_size_metric, effect_size_ci, pre_registered_decision_rule, and AsPredicted IDs. Public receipt health is governed by /research/preregistrations.json; pending-public-url entries are not treated as externally verified until their public AsPredicted page resolves to the expected ID, title, and filing date.",
    "credibility_loaded_term_audit": "This file passes the credibility-loaded-term audit per CLAUDE.md: every effect-size estimate has a CI in the same line; 'rejected' refers to a formal pre-registered test, not informal opinion; 'robust' and 'near zero' do not appear without their CI / TOST counterpart; only pre-registered tests are listed (V1.5 S6 was an exploratory follow-up, not a pre-registered hypothesis, and is therefore not a registry entry)."
  }
}
