Building Cohorts from Concept Sets

library(tidyOhdsiSolutions)

Overview

cohortFromConceptSet() builds a complete CirceR-compatible cohort definition from one or more concept set expressions. It produces a nested R list that can be serialized to valid JSON with cohortToJson() — no Java, CirceR, or Capr dependency required.

Typical Workflow

data.frame ──► toConceptSet() ──► cohortFromConceptSet() ──► cohortToJson()
  1. Start with data frames containing concept_id (and optional metadata).
  2. Convert them to CIRCE concept set expressions with toConceptSet() or toConceptSets().
  3. Pass a named list of expressions to cohortFromConceptSet().
  4. Serialize the result to JSON with cohortToJson().

Step 1: Define Concept Sets as Data Frames

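Each concept set starts as a data frame with at minimum a concept_id column. Optional columns set the inclusion flags (descendants, excluded) and supply concept metadata (such as concept_name, domain_id, vocabulary_id, and standard_concept):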

# Two SNOMED condition concepts; both rows have the descendants flag set
# and the excluded flag unset.
diabetes_df <- data.frame(
  concept_id = c(201826L, 442793L),
  concept_name = c(
    "Type 2 diabetes mellitus",
    "Diabetes mellitus due to insulin resistance"
  ),
  domain_id = rep("Condition", 2L),
  vocabulary_id = rep("SNOMED", 2L),
  standard_concept = rep("S", 2L),
  descendants = rep(TRUE, 2L),
  excluded = rep(FALSE, 2L)
)

# A single-row concept set: one SNOMED condition concept with the
# descendants flag set and the excluded flag unset.
hypertension_df <- data.frame(
  concept_id = 320128L,
  concept_name = "Essential hypertension",
  domain_id = "Condition",
  vocabulary_id = "SNOMED",
  standard_concept = "S",
  descendants = TRUE,
  excluded = FALSE
)

Step 2: Convert to Concept Set Expressions

# Convert each data frame into its own named concept set expression.
diabetes_cs <- toConceptSet(
  diabetes_df,
  name = "Type 2 Diabetes"
)
hypertension_cs <- toConceptSet(
  hypertension_df,
  name = "Hypertension"
)

Or convert multiple at once with toConceptSets():

# One call converts every data frame in a named list.
all_cs <- toConceptSets(
  list(
    "Type 2 Diabetes" = diabetes_df,
    "Hypertension" = hypertension_df
  )
)

Step 3: Build the Cohort

Pass a named list of concept set expressions to cohortFromConceptSet():

cohort <- cohortFromConceptSet(
  conceptSetList = all_cs,
  limit = "earliest",
  # c(prior, post): days of continuous observation around the index date
  requiredObservation = c(365L, 0L),
  end = "observation_period_end_date"
)

Parameters

| Argument | Values | Description |
|---|---|---|
| conceptSetList | named list | Each element is a concept set expression with $items |
| limit | "earliest", "all", "latest" | Which qualifying event(s) to keep |
| requiredObservation | c(prior, post) | Days of continuous observation required before and after the index date |
| end | "observation_period_end_date", "fixed_exit", "drug_exit" | How the cohort era ends |
| endArgs | list(...) | Extra parameters for the chosen end strategy |
| addSourceCriteria | TRUE / FALSE | Also match on source (non-standard) concept codes |

Step 4: Export to JSON

json <- cohortToJson(cohort)

# Print only the first 300 characters of the serialized cohort.
cat(substr(json, start = 1, stop = 300), "...\n")
#> {
#>   "ConceptSets": [
#>     {
#>       "id": 0,
#>       "name": "Type 2 Diabetes",
#>       "expression": {
#>         "items": [
#>           {
#>             "concept": {
#>               "CONCEPT_ID": 201826,
#>               "CONCEPT_NAME": "Type 2 diabetes mellitus",
#>               "STANDARD_CONCEPT": "S",
#>                ...

The JSON string is ready for CirceR::cohortExpressionFromJson() or CirceR::buildCohortQuery(), or can be saved to a file:

# Writes the JSON string to a file in the current working directory.
writeLines(text = json, con = "my_cohort.json")

End Strategies

Default: observation period end date

The cohort era ends when the person’s observation period ends.

cohort_obs <- cohortFromConceptSet(
  conceptSetList = all_cs,
  end = "observation_period_end_date"
)

Fixed exit: offset from index

End the cohort era a fixed number of days (offsetDays) after the event's start or end date (selected with index).

cohort_fixed <- cohortFromConceptSet(
  conceptSetList = all_cs,
  end = "fixed_exit",
  endArgs = list(
    index = "startDate",
    offsetDays = 180
  )
)

# Inspect the generated end strategy
cohort_fixed$EndStrategy$DateOffset
#> $DateField
#> [1] "StartDate"
#> 
#> $Offset
#> [1] 180

Drug exit: era-based persistence

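For drug exposures, end the cohort using drug era logic with configurable persistence (gap) and surveillance windows. In the resulting CustomEra strategy, persistenceWindow is emitted as GapDays and surveillanceWindow as Offset.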

# A single RxNorm drug concept with the descendants flag set and the
# excluded flag unset.
drug_df <- data.frame(
  concept_id = 1503297L,
  concept_name = "Metformin",
  domain_id = "Drug",
  vocabulary_id = "RxNorm",
  standard_concept = "S",
  descendants = TRUE,
  excluded = FALSE
)

drug_cs <- toConceptSets(
  list("Metformin" = drug_df)
)

cohort_drug <- cohortFromConceptSet(
  conceptSetList = drug_cs,
  end = "drug_exit",
  endArgs = list(
    persistenceWindow = 30,
    surveillanceWindow = 7
  )
)

# Inspect the generated end strategy
cohort_drug$EndStrategy$CustomEra
#> $DrugCodesetId
#> [1] 0
#> 
#> $GapDays
#> [1] 30
#> 
#> $Offset
#> [1] 7

Event Limits

# Keep only the earliest qualifying event per person
# limit = "earliest": only the first qualifying event per person
earliest <- cohortFromConceptSet(conceptSetList = all_cs, limit = "earliest")
earliest$PrimaryCriteria$PrimaryCriteriaLimit$Type
#> [1] "First"

# limit = "all": every qualifying event
all_events <- cohortFromConceptSet(conceptSetList = all_cs, limit = "all")
all_events$PrimaryCriteria$PrimaryCriteriaLimit$Type
#> [1] "All"

# limit = "latest": only the last qualifying event
latest <- cohortFromConceptSet(conceptSetList = all_cs, limit = "latest")
latest$PrimaryCriteria$PrimaryCriteriaLimit$Type
#> [1] "Last"

Source Criteria

When addSourceCriteria = TRUE, every primary criteria entry gets a companion entry that matches on source (non-standard) concept codes. This doubles the number of primary criteria entries (here, from 2 to 4):

# With source criteria
cohort_src <- cohortFromConceptSet(
  conceptSetList = all_cs,
  addSourceCriteria = TRUE
)
length(cohort_src$PrimaryCriteria$CriteriaList)
#> [1] 4

# Without source criteria
cohort_plain <- cohortFromConceptSet(
  conceptSetList = all_cs,
  addSourceCriteria = FALSE
)
length(cohort_plain$PrimaryCriteria$CriteriaList)
#> [1] 2

Structure of the Output

The returned list mirrors the CirceR cohort expression format:

# Top-level elements of the cohort definition
names(cohort)
#> [1] "ConceptSets"       "PrimaryCriteria"   "QualifiedLimit"   
#> [4] "ExpressionLimit"   "InclusionRules"    "EndStrategy"      
#> [7] "CensoringCriteria" "CollapseSettings"  "CensorWindow"

# Number of concept sets
length(cohort$ConceptSets)
#> [1] 2

# Names of concept sets
vapply(
  cohort$ConceptSets,
  function(concept_set) concept_set[["name"]],
  FUN.VALUE = character(1)
)
#> [1] "Type 2 Diabetes" "Hypertension"

# Observation window
cohort$PrimaryCriteria$ObservationWindow
#> $PriorDays
#> [1] 365
#> 
#> $PostDays
#> [1] 0