| Title: | Automated Data Quality Checks for Recurring Dataset Deliveries |
|---|---|
| Description: | Automates quality verification of recurring external dataset deliveries. For each new file arrival, it runs single-snapshot quality checks, compares the file to the previous delivery, writes a self-contained 'HTML' report, and records summary statistics in a local 'SQLite' database for long-term trend tracking. Supports 'CSV' and fixed-width formats. Custom organisation-specific checks can be supplied as plain R files. |
| Authors: | Mick Mioduszewski [aut, cre] |
| Maintainer: | Mick Mioduszewski <[email protected]> |
| License: | MIT + file LICENSE |
| Version: | 0.2.1 |
| Built: | 2026-06-08 00:23:51 UTC |
| Source: | https://github.com/mickmioduszewski/dqcheckr |
For each column that has allowed_values configured in
config$column_rules, returns a dq_result flagging any
non-empty values not in the allowed list. Returns an empty list when no
allowed_values rules are configured.
check_allowed_values(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects, one per configured column.
Status is "FAIL" when unexpected values are found; "PASS"
otherwise. Returns an empty list if no allowed_values rules are
configured.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_allowed_values(df, cfg)
Returns a single "INFO" dq_result recording the number
of columns in the data frame. Never fails or warns.
check_col_count(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list containing one dq_result with status
"INFO".
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_col_count(df, cfg)
For each column whose resolved type is "character", returns one
"INFO" dq_result with the count of distinct non-empty
values. Columns inferred as numeric or date are silently skipped.
check_distinct_counts(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects (one per character column),
all with status "INFO". Returns an empty list if no character
columns are found.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_distinct_counts(df, cfg)
Returns a single dq_result for the whole table. A row is
considered a duplicate when every column value is identical to another row.
check_duplicate_rows(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list containing one dq_result.
Status is "WARN" if any duplicate rows exist; "PASS"
otherwise.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_duplicate_rows(df, cfg)
Returns a dq_result per column. A column is considered empty
when every value is NA or the empty string "".
check_empty_column(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects, one per column.
Status is "FAIL" for entirely empty columns; "PASS"
otherwise.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_empty_column(df, cfg)
Returns one "INFO" dq_result per column recording the
type resolved by resolve_col_type ("date",
"numeric", "character", or "unknown").
Per-column overrides from config$column_types are respected.
check_inferred_types(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects, one per column, all with
status "INFO".
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_inferred_types(df, cfg)
Checks that the column(s) listed in config$key_columns have no
duplicate values. When key_columns is a single string, one result is
returned for that column. When it is a character vector of length > 1, a
single result covering the composite key is returned. Returns an empty list
if key_columns is not configured.
check_key_uniqueness(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects. Status is "FAIL"
when duplicates or missing key columns are detected; "PASS"
otherwise. Returns an empty list if key_columns is not configured.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_key_uniqueness(df, cfg)
Runs up to three sub-checks, each returning a separate
dq_result:
- File size – only when file_path is supplied and
  max_file_size_mb is configured in rules: FAIL if the file
  exceeds the size limit.
- Minimum row count – FAIL if row_count <
  min_row_count. Skipped (PASS with a note) when min_row_count
  is 0.
- Maximum row count – only when max_row_count is
  configured in rules: FAIL if row_count > max_row_count.
check_min_row_count(df, config, file_path = NULL)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
file_path |
Character or NULL (default). Path to the delivered file; needed only for the file-size sub-check. |
A list of dq_result objects (one to three entries
depending on which sub-checks are active).
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_min_row_count(df, cfg, file_path = path)
Returns a dq_result per column flagging columns whose
proportion of missing or empty values exceeds max_missing_rate.
check_missing_rate(df, config)
df |
A data frame with all columns as character vectors. |
config |
Named list as returned by load_config(). |
A list of dq_result objects, one per column.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_missing_rate(df, cfg)
For each column whose resolved type is "numeric", computes the
proportion of non-empty values that cannot be coerced to numeric. Returns
"FAIL" when the rate exceeds max_non_numeric_rate (default
0.01), "WARN" when it exceeds warn_non_numeric_rate (default
0), and "PASS" otherwise. Both thresholds support per-column
overrides via config$column_rules.
check_non_numeric(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects, one per numeric column.
Returns an empty list if no numeric columns are found.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_non_numeric(df, cfg)
For each column that has min_value or max_value configured in
config$column_rules, returns a dq_result flagging any
values that fall outside the specified range. Returns an empty list when no
bound rules are configured.
check_numeric_bounds(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects, one per configured column.
Status is "FAIL" when out-of-range values are found; "PASS"
otherwise. Returns an empty list if no bound rules are configured.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_numeric_bounds(df, cfg)
For each column whose resolved type is "numeric", returns one
"INFO" dq_result containing min, max, mean, and
standard deviation of the parseable values. Columns inferred as non-numeric
are silently skipped.
check_numeric_stats(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects (one per numeric column),
all with status "INFO". Returns an empty list if no numeric columns
are found.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_numeric_stats(df, cfg)
For each column whose resolved type is "numeric", applies up to two
outlier detection methods (combined with logical OR):
- Z-score: values whose absolute Z-score exceeds
  max_z_score are flagged.
- IQR fence: values below Q1 - k * IQR or above
  Q3 + k * IQR (where k = iqr_fence_multiplier) are
  flagged.
Both thresholds support per-column overrides via config$column_rules.
A column is skipped (PASS with a note) when neither threshold is configured
or when it has fewer than four parseable values.
check_outliers(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects, one per numeric column.
Status is "FAIL" when outliers are detected; "PASS"
otherwise. Returns an empty list if no numeric columns are found.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_outliers(df, cfg)
For each column that has a pattern configured in
config$column_rules, returns a dq_result reporting how
many non-empty values do not match the Perl-compatible regular expression.
Returns an empty list when no pattern rules are configured.
check_pattern(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects, one per configured column.
Status is "FAIL" when any values violate the pattern; "PASS"
otherwise. Returns an empty list if no pattern rules are configured.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_pattern(df, cfg)
Returns a single "INFO" dq_result recording the number
of rows in the data frame. Never fails or warns; use
check_min_row_count for threshold-based row count checks.
check_row_count(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list containing one dq_result with status
"INFO".
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_row_count(df, cfg)
Compares the columns present in df against
config$expected_columns:
- SC-01: one "FAIL" result per column present in the
  file but not listed in expected_columns.
- SC-02: one "FAIL" result per column listed in
  expected_columns but absent from the file.
Returns an empty list if expected_columns is not configured.
check_schema_contract(df, config)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects. Each schema violation
produces one "FAIL" result; a "PASS" result is emitted for
each sub-check when no violations are found. Returns an empty list if
expected_columns is not configured.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
check_schema_contract(df, cfg)
Reads two historical snapshot records (by ID) from the SQLite database and computes table-level, schema, and per-column statistical drift. Optionally renders an HTML drift report.
compare_snapshots(
  dataset_name,
  snapshot_id_prev = NULL,
  snapshot_id_curr = NULL,
  db_path = NULL,
  config_dir = ".",
  report = TRUE,
  open_report = interactive()
)
dataset_name |
Character. Dataset name to compare. |
snapshot_id_prev |
Integer or NULL (default). ID of the earlier snapshot to compare. |
snapshot_id_curr |
Integer or NULL (default). ID of the later snapshot to compare. |
db_path |
Character or NULL (default). Path to the SQLite snapshot database. |
config_dir |
Character. Path to the directory containing dqcheckr.yml and the dataset YAML file. |
report |
Logical. Whether to render an HTML drift report. |
open_report |
Logical. Whether to open the HTML report in the browser after rendering (only takes effect in interactive sessions). |
Invisibly, a named list with elements dataset_name,
snap_prev, snap_curr, table_drift,
schema_changes, missing_rate_changes,
non_numeric_changes, mean_shifts, distinct_changes.
tmp <- tempdir()
db_path <- file.path(tmp, "snap.sqlite")
cfg_yml <- file.path(tmp, "dqcheckr.yml")
ds_yml <- file.path(tmp, "starwars_csv.yml")
dat <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
writeLines(c(
  paste0('snapshot_db: "', db_path, '"'),
  paste0('report_output_dir: "', tmp, '"'),
  'default_rules:',
  ' max_missing_rate: 0.60',
  ' min_row_count: 80'
), cfg_yml)
writeLines(c(
  'dataset_name: "starwars_csv"',
  paste0('current_file: "', dat, '"'),
  'format: csv',
  'encoding: "UTF-8"',
  'delimiter: ","'
), ds_yml)
run_dq_check("starwars_csv", config_dir = tmp, open_report = FALSE)
run_dq_check("starwars_csv", config_dir = tmp, open_report = FALSE)
drift <- compare_snapshots("starwars_csv", config_dir = tmp, report = FALSE)
names(drift)
Resolves the current and previous file paths from the configuration. If
current_file is set explicitly, it is used directly. Otherwise the
two most recently modified files in folder are used.
detect_files(config)
config |
Named list. Merged configuration as returned by load_config(). |
A named list with elements current (character path) and
previous (character path or NULL).
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
cfg$current_file <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
files <- detect_files(cfg)
files$current
Creates the atomic result unit returned by every check function.
dq_result(
  check_id,
  check_name,
  column = NA_character_,
  status,
  observed,
  threshold = NA_character_,
  message
)
check_id |
Character. Short identifier for the check (e.g. "QC-01"). |
check_name |
Character. Human-readable name of the check. |
column |
Character. Column the check applies to, or NA_character_ (the default) when the check is not tied to a single column. |
status |
Character. One of "PASS", "WARN", "FAIL", or "INFO". |
observed |
Character. What was observed (e.g. "0% missing"). |
threshold |
Character. The configured threshold, or NA_character_ (the default) when none applies. |
message |
Character. Human-readable description of the result. |
A named list with seven elements: check_id, check_name,
column, status, observed, threshold,
message.
dq_result("QC-01", "Missing rate", column = "age", status = "PASS", observed = "0% missing", message = "No missing values.")
Classifies a character vector as "date", "numeric",
"character", or "unknown" by applying rules in priority order.
infer_col_type(x, threshold = 0.9)
x |
Character vector to classify (as read from a CSV or FWF file). |
threshold |
Numeric. Minimum proportion of non-empty values that must
parse as numeric for the column to be classified as "numeric". |
A single character string: "date", "numeric",
"character", or "unknown".
infer_col_type(c("2024-01-01", "2024-06-15")) # "date"
infer_col_type(c("1.5", "2.0", "3.1")) # "numeric"
infer_col_type(c("high", "low", "medium")) # "character"
infer_col_type(c(NA, "", NA)) # "unknown"
infer_col_type(c(rep("1", 17), "a", "b", "c"), threshold = 0.80) # "numeric"
Returns a data frame of snapshot records for the given dataset (or all
datasets if dataset_name is NULL), ordered by dataset name
and snapshot ID.
list_snapshots(dataset_name = NULL, db_path = NULL)
dataset_name |
Character or NULL. Dataset to list snapshots for; NULL (the default) lists all datasets. |
db_path |
Character. Path to the SQLite snapshot database. Required; there is no default (a relative default would be path-sensitive). |
A data frame with columns id, dataset_name,
file_name, run_timestamp, row_count,
overall_status. Returns an empty data frame if the database does not
exist or contains no matching records.
list_snapshots(db_path = tempfile(fileext = ".sqlite"))
Reads the global dqcheckr.yml and the dataset-specific YAML, merging
rule_overrides from the dataset config on top of default_rules
from the global config. Top-level keys snapshot_db and
report_output_dir are inherited from the global config when absent
from the dataset config.
load_config(dataset_name, config_dir)
dataset_name |
Character. Dataset name; must match the name of the dataset YAML file in config_dir (e.g. "starwars_csv" for starwars_csv.yml). |
config_dir |
Character. Path to the directory containing both YAML files. |
A named list representing the merged configuration.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
cfg$format
Returns the single worst status in precedence order:
"FAIL" > "WARN" > "PASS" > "INFO".
overall_status(results)
results |
A list of dq_result objects. |
A single character string: "FAIL", "WARN",
"PASS", or "INFO".
r1 <- dq_result("QC-01", "test", status = "PASS", observed = "ok", message = "ok")
r2 <- dq_result("QC-02", "test", status = "WARN", observed = "ok", message = "ok")
overall_status(list(r1, r2)) # "WARN"
Reads a CSV or fixed-width file, coercing all columns to character and
trimming whitespace. Encoding and delimiter are taken from config.
read_dataset(path, config)
path |
Character. Path to the file to read. |
config |
Named list. Merged configuration as returned by load_config(). |
A data frame with all columns as character vectors.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
Retrieves the n most recent run records for a given dataset from the
snapshot database, ordered newest-first.
read_recent_snapshots(db_path, dataset_name, n = 10)
db_path |
Character. Path to the SQLite database file. |
dataset_name |
Character. Dataset name to filter on. |
n |
Integer. Maximum number of records to return. Defaults to 10. |
A data frame with one row per run and columns including
id, dataset_name, run_timestamp, file_name,
row_count, col_count, overall_status,
check_pass_count, check_warn_count, check_fail_count,
check_info_count, new_cols_vs_previous,
missing_cols_vs_previous, new_cols_vs_schema,
missing_cols_vs_schema, comparison_mode,
render_status, and type_changed_cols_vs_previous.
Returns an empty data frame if the database does not exist or contains no
records for the dataset.
history <- read_recent_snapshots(tempfile(fileext = ".sqlite"), "starwars_csv")
Returns the type for col from the column_types map in
config if one is set, otherwise falls back to
infer_col_type. Use this in custom check scripts instead of
calling infer_col_type() directly so that type overrides are
respected.
resolve_col_type(col, x, config)
col |
Character. Column name. |
x |
Character vector. The column's values (as read from the file). |
config |
Named list. Merged configuration as returned by load_config(). |
A single character string: "date", "numeric",
"character", or "unknown".
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
resolve_col_type("name", c("Luke", "Leia", "Han"), cfg) # "character"
Runs CP-01 to CP-08 comparing a current delivery against the previous one.
run_comparison_checks(df_current, df_previous, config)
df_current |
A data frame. The current delivery. |
df_previous |
A data frame. The previous delivery. |
config |
Named list. Merged configuration as returned by load_config(). |
A list of dq_result objects. The list carries
attributes new_cols, dropped_cols, and
type_changed_cols (character vectors) for use by the snapshot
writer.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
curr_path <- system.file("demonstrations/data2/starwars_v2.csv", package = "dqcheckr")
prev_path <- system.file("demonstrations/data2/starwars_v1.csv", package = "dqcheckr")
curr <- read_dataset(curr_path, cfg)
prev <- read_dataset(prev_path, cfg)
results <- run_comparison_checks(curr, prev, cfg)
Sources the R file specified by config$custom_checks_file, which must
define a function custom_checks(df) returning a list of
dq_result objects. Returns an empty list if
custom_checks_file is not set in the config.
run_custom_checks(df, config)
df |
A data frame. The current delivery. |
config |
Named list. Merged configuration as returned by load_config(). |
The file is sourced into an isolated environment whose parent is
baseenv(), so only base R functions are available by default.
dq_result is explicitly injected and can be called without
qualification. All other dqcheckr exports (e.g. resolve_col_type,
infer_col_type) must be qualified: dqcheckr::resolve_col_type().
Any error – missing file, undefined function, or runtime failure – stops the
run with a clear message.
A list of dq_result objects (may be empty).
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
results <- run_custom_checks(df, cfg)
Orchestrates the complete dqcheckr pipeline: loads configuration, detects files, runs QC and comparison checks, writes a snapshot to SQLite, and renders an HTML report.
run_dq_check(dataset_name, config_dir = ".", open_report = TRUE)
dataset_name |
Character. Name of the dataset; must match a YAML config
file in config_dir (e.g. "starwars_csv" for starwars_csv.yml). |
config_dir |
Character. Path to the directory containing dqcheckr.yml and the dataset YAML file. |
open_report |
Logical. Whether to open the HTML report in the browser after rendering (only takes effect in interactive sessions). |
Invisibly, a named list with three elements:
- status: overall status string: "PASS", "WARN",
  "FAIL", or "INFO".
- the absolute path to the rendered HTML report, or
  NULL if rendering was skipped.
- the integer row ID of the snapshot written to SQLite,
  or NULL if the write failed.
tmp <- gsub("\\\\", "/", tempdir())
dat <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
writeLines(c(
  paste0('snapshot_db: "', tmp, '/snap.sqlite"'),
  paste0('report_output_dir: "', tmp, '"'),
  'default_rules:',
  ' max_missing_rate: 0.60',
  ' min_row_count: 80'
), file.path(tmp, "dqcheckr.yml"))
writeLines(c(
  'dataset_name: "starwars_csv"',
  paste0('current_file: "', dat, '"'),
  'format: csv',
  'encoding: "UTF-8"',
  'delimiter: ","'
), file.path(tmp, "starwars_csv.yml"))
result <- run_dq_check("starwars_csv", config_dir = tmp, open_report = FALSE)
result$status
Runs the full QC check suite (QC-01 to QC-15, SC-01, SC-02) against a single data frame snapshot.
run_qc_checks(df, config, file_path = NULL)
df |
A data frame with all columns as character vectors (as returned by read_dataset()). |
config |
Named list. Merged configuration as returned by load_config(). |
file_path |
Character or NULL (default). Path to the file being checked. |
A list of dq_result objects.
cfg_dir <- system.file("demonstrations/config", package = "dqcheckr")
cfg <- load_config("starwars_csv", config_dir = cfg_dir)
path <- system.file("demonstrations/data/starwars.csv", package = "dqcheckr")
df <- read_dataset(path, cfg)
results <- run_qc_checks(df, cfg)