{"publication_id":"6c57c982-baf4-481a-ae96-487d29a8299d","content_hash":"sha256:b1d753d787d0a23d0276a9b5390e14b67f0b234e13dfe51f8775b952018eeae9","nodes":[{"id":"6c57c982-baf4-481a-ae96-487d29a8299d","type":"publication","title":"Model eval: Medqa Accuracy is the shared direct-receipt signal"},{"id":"claim_1","type":"claim","text":"Interpretation note:** This is a hypothesis-generating alpha memo, not confirmatory evidence; subgroup or context-derived claims require independent replication."},{"id":"claim_2","type":"claim","text":"Bounded research question:** Do independent direct receipts on Medqa continue to support a signal on Accuracy for the cited systems when comparators are kept explicit?"},{"id":"claim_3","type":"claim","text":"Treat this as a benchmark-shaped evidence bundle, not a broad claim about the whole topic. The next extraction should preserve model, baseline, and protocol fields for each receipt."},{"id":"claim_4","type":"claim","text":"_No direct opposing receipt was selected by this run. Treat that as a bundle limitation, not a claim that the wider literature has no counter-evidence._"},{"id":"source_1","type":"source","study":"Large Language Models Encode Clinical Knowledge","year":2022,"doi":"10.48550/arxiv.2212.13138","url":null,"population":"not extracted","intervention_or_exposure":"not extracted","comparator":"not extracted","endpoint":"not extracted","effect":"not extracted","risk_of_bias":"not appraised in public sidecar","directness":"primary"},{"id":"source_2","type":"source","study":"Large language models encode clinical knowledge","year":2023,"doi":"10.1038/s41586-023-06291-2","url":null,"population":"not extracted","intervention_or_exposure":"not extracted","comparator":"not extracted","endpoint":"not extracted","effect":"not extracted","risk_of_bias":"not appraised in public sidecar","directness":"primary"},{"id":"source_3","type":"source","study":"FUO_ED: A Dataset for Evaluating the Performance of Large Language Models in Diagnosing Complex Cases of Fever of Unknown Origin","year":2024,"doi":"10.1145/3718391.3718410","url":null,"population":"not extracted","intervention_or_exposure":"not extracted","comparator":"not extracted","endpoint":"not extracted","effect":"not extracted","risk_of_bias":"not appraised in public sidecar","directness":"primary"},{"id":"source_4","type":"source","study":"OpenMedLM: prompt engineering can out-perform fine-tuning in medical question-answering with open-source large language models","year":2024,"doi":"10.1038/s41598-024-64827-6","url":null,"population":"not extracted","intervention_or_exposure":"not extracted","comparator":"not extracted","endpoint":"not extracted","effect":"not extracted","risk_of_bias":"not appraised in public sidecar","directness":"primary"},{"id":"source_5","type":"source","study":"Benchmarking large language model-based agent systems for clinical decision tasks.","year":2026,"doi":"10.1038/s41746-026-02443-6","url":null,"population":"not extracted","intervention_or_exposure":"not extracted","comparator":"not extracted","endpoint":"not extracted","effect":"not extracted","risk_of_bias":"not appraised in public sidecar","directness":"primary"}],"edges":[{"from":"6c57c982-baf4-481a-ae96-487d29a8299d","to":"claim_1","type":"contains_claim"},{"from":"6c57c982-baf4-481a-ae96-487d29a8299d","to":"claim_2","type":"contains_claim"},{"from":"6c57c982-baf4-481a-ae96-487d29a8299d","to":"claim_3","type":"contains_claim"},{"from":"6c57c982-baf4-481a-ae96-487d29a8299d","to":"claim_4","type":"contains_claim"}],"screening":{"identified":5,"screened":5,"excluded":0,"included":5,"included_or_retained":5,"flow":["identified","screened","excluded_with_reasons","included"],"wording":"5 candidate receipts retained after source retrieval, deduplication, and topic filtering. This is an evidence-map screening trace, not a PRISMA full-text exclusion audit.","exclusion_reasons":["No PRISMA full-text exclusion-stage filter was applied."]}}