diff --git a/.github/workflows/ci-numba-bench.yml b/.github/workflows/ci-numba-bench.yml index 01bc151f6b..fcf2257ac5 100644 --- a/.github/workflows/ci-numba-bench.yml +++ b/.github/workflows/ci-numba-bench.yml @@ -9,6 +9,9 @@ on: - cron: "37 4 * * *" # nightly at 04:37 UTC workflow_dispatch: +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true" + permissions: contents: read diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index ed59963dc0..b53429a69c 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -109,7 +109,12 @@ jobs: sbom: true - name: Generate build provenance attestation - uses: actions/attest-build-provenance@e8998f949152b193b063cb0ec769d69d929409be # v2 + # v4.1.0 — composite action that internally uses + # actions/attest@v4.1.0 with `using: node24`. Bumped from v2 + # (Node-20-targeting sub-actions) on 2026-05-21 to clear the + # "Node.js 20 deprecated" annotation that appeared on the + # v0.9.4 Docker publish run (26019670710). + uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4.1.0 with: subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} subject-digest: ${{ steps.build.outputs.digest }} diff --git a/.github/workflows/r-cmd-check.yml b/.github/workflows/r-cmd-check.yml index 1092c424f5..57bc7b3811 100644 --- a/.github/workflows/r-cmd-check.yml +++ b/.github/workflows/r-cmd-check.yml @@ -2,6 +2,10 @@ name: R CMD check env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true" + # Parallel compile for source-installed dependencies (notably duckdb, + # which has ~50 MB of C++ source and otherwise takes 20-25 min single- + # threaded). GitHub-hosted runners are 4-vCPU / 16 GB RAM so -j4 fits. + MAKEFLAGS: "-j4" # Run R CMD check across the supported OS x R-version matrix. Required @@ -17,6 +21,11 @@ on: paths: - 'r-package/morie/**' - '.github/workflows/r-cmd-check.yml' + pull_request: + branches: [main] + paths: + - 'r-package/morie/**' + - '.github/workflows/r-cmd-check.yml' workflow_dispatch: permissions: @@ -33,7 +42,9 @@ jobs: matrix: config: - { os: macos-latest, r: 'release' } - - { os: windows-latest, r: 'release' } + # Pin Windows runner explicitly (windows-latest auto-redirects to + # windows-2025-vs2026 by 2026-06-15; pre-pin removes ambiguity). + - { os: windows-2025, r: 'release' } - { os: ubuntu-latest, r: 'release' } - { os: ubuntu-latest, r: 'devel' } - { os: ubuntu-latest, r: 'oldrel-1' } @@ -72,3 +83,65 @@ jobs: # vignettes/PDFs in CI. args: 'c("--no-manual")' error-on: '"error"' + + # Separate Linux-only job that spins up a live PostgreSQL service and + # exercises the PG branch of the DBI cache layer. The PG testthat + # block in tests/testthat/test-db-backends.R is gated on + # MORIE_PG_TEST=true; this job sets that env var so the tests run. + # The default R-CMD-check matrix above does NOT set MORIE_PG_TEST, + # so the PG block correctly skips there. + R-CMD-check-postgres: + runs-on: ubuntu-latest + name: ubuntu-latest (release) + postgres-15 + + services: + postgres: + image: postgres:15 + env: + POSTGRES_DB: testdb + POSTGRES_USER: postgres + POSTGRES_PASSWORD: password + ports: + - 5432:5432 + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + env: + _R_CHECK_FORCE_SUGGESTS_: false + R_KEEP_PKG_SOURCE: yes + MORIE_PG_TEST: "true" + PGHOST: localhost + PGPORT: "5432" + PGUSER: postgres + PGPASSWORD: password + PGDATABASE: testdb + + defaults: + run: + working-directory: r-package/morie + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + + - uses: r-lib/actions/setup-pandoc@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + + - uses: r-lib/actions/setup-r@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + r-version: 'release' + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + working-directory: r-package/morie + extra-packages: any::rcmdcheck any::RPostgres any::duckdb any::withr + needs: check + + - uses: r-lib/actions/check-r-package@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + working-directory: r-package/morie + upload-snapshots: true + args: 'c("--no-manual")' + error-on: '"error"' diff --git a/.github/workflows/r-coverage-and-lint.yml b/.github/workflows/r-coverage-and-lint.yml new file mode 100644 index 0000000000..8707839b77 --- /dev/null +++ b/.github/workflows/r-coverage-and-lint.yml @@ -0,0 +1,178 @@ +name: R coverage + lint + goodpractice + +# Runs covr (with codecov.io upload), lintr, and goodpractice on every +# push to main / PR. Complements r-cmd-check.yml — which only runs +# R CMD check across the OS × R-version matrix. Together they cover +# the full rOpenSci pkgcheck surface (rcmdcheck + covr + cyclocomp + +# lintr + goodpractice). + +on: + push: + branches: [main] + paths: + - 'r-package/morie/**' + - '.github/workflows/r-coverage-and-lint.yml' + pull_request: + branches: [main] + paths: + - 'r-package/morie/**' + workflow_dispatch: + +env: + # Force every JavaScript-based action (codecov, upload-artifact, etc.) + # to use Node.js 24 instead of the deprecated Node.js 20. + # Node 20 is scheduled for removal from runners on 2026-09-16. + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true" + # Parallel compile for source-installed Suggests (notably duckdb, + # which has ~50 MB of C++ source). 4-vCPU runners + -j4 cuts the + # dependency-install step from ~25 min single-threaded to ~5 min. + MAKEFLAGS: "-j4" + +permissions: + contents: read + +jobs: + coverage: + runs-on: ubuntu-latest + name: covr + codecov upload + + env: + _R_CHECK_FORCE_SUGGESTS_: false + R_KEEP_PKG_SOURCE: yes + + defaults: + run: + working-directory: r-package/morie + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + - uses: r-lib/actions/setup-r@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + r-version: 'release' + use-public-rspm: true + - uses: r-lib/actions/setup-r-dependencies@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + working-directory: r-package/morie + extra-packages: any::covr any::xml2 + needs: coverage + + - name: Run covr + run: | + Rscript -e 'cov <- covr::package_coverage(type = "tests", quiet = TRUE); covr::to_cobertura(cov, filename = "coverage.xml"); cat(sprintf("Coverage: %.2f%%\n", covr::percent_coverage(cov)))' + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v5 + with: + files: r-package/morie/coverage.xml + flags: r-tests + fail_ci_if_error: false # don't block on codecov outages + continue-on-error: true + + lint: + runs-on: ubuntu-latest + name: lintr + + defaults: + run: + working-directory: r-package/morie + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + - uses: r-lib/actions/setup-r@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + r-version: 'release' + use-public-rspm: true + - uses: r-lib/actions/setup-r-dependencies@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + working-directory: r-package/morie + extra-packages: any::lintr + + - name: Run lintr + run: | + Rscript -e 'lintr::lint_package()' + + goodpractice: + runs-on: ubuntu-latest + name: goodpractice (covr + cyclocomp + lintr + rcmdcheck) + + env: + _R_CHECK_FORCE_SUGGESTS_: false + + defaults: + run: + working-directory: r-package/morie + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + - uses: r-lib/actions/setup-pandoc@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + - uses: r-lib/actions/setup-r@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + r-version: 'release' + use-public-rspm: true + - uses: r-lib/actions/setup-r-dependencies@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + working-directory: r-package/morie + extra-packages: any::goodpractice any::rcmdcheck any::covr any::cyclocomp any::lintr + + - name: Run goodpractice + run: | + Rscript -e 'g <- goodpractice::gp("."); print(g)' + + pkgcheck: + # rOpenSci's own check — runs pkgstats + covr + goodpractice + rcmdcheck + # internally and produces the markdown comment the rOpenSci bot uses. + runs-on: ubuntu-latest + name: rOpenSci pkgcheck + + defaults: + run: + working-directory: r-package/morie + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + - uses: r-lib/actions/setup-pandoc@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + - uses: r-lib/actions/setup-r@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + r-version: 'release' + use-public-rspm: true + + # pkgcheck's internal rcmdcheck builds the PDF manual, which needs + # the `inconsolata` LaTeX font. Without it pkgcheck reports a + # spurious "R CMD check found 1 warning" against a package that + # otherwise has 0 warnings in the dedicated r-cmd-check.yml matrix. + - uses: r-lib/actions/setup-tinytex@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + - name: Install inconsolata + helpers for PDF manual build + run: | + tlmgr update --self + tlmgr install inconsolata helvetic times courier ec parskip url upquote + + - name: Install system deps for pkgstats (universal-ctags) + run: | + sudo apt-get update -y + sudo apt-get install -y universal-ctags + + - uses: r-lib/actions/setup-r-dependencies@a51a8012b0aab7c32ef9d19bf54da93f3254335e # v2 + with: + working-directory: r-package/morie + extra-packages: | + any::rcmdcheck + any::goodpractice + any::cyclocomp + ropensci-review-tools/pkgstats + ropensci-review-tools/pkgcheck + + - name: Run pkgcheck::pkgcheck() + checks_to_markdown() + env: + # pkgcheck queries the GitHub GraphQL API for repo metadata. Use + # the runner's auto-provided token instead of unauthenticated + # requests (which hit the 60-req/hr rate limit instantly). + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + Rscript -e 'cks <- pkgcheck::pkgcheck("."); md <- pkgcheck::checks_to_markdown(cks); writeLines(md, "pkgcheck.md"); cat(md)' + + - name: Upload pkgcheck markdown + uses: actions/upload-artifact@v5 + with: + name: pkgcheck-markdown + path: r-package/morie/pkgcheck.md diff --git a/.gitignore b/.gitignore index df748a7ce4..92ced9f5c8 100644 --- a/.gitignore +++ b/.gitignore @@ -114,3 +114,4 @@ papers/PAPERS_*.txt RELEASE-EMAILS-*.md *-EMAILS-*.md morie-private/ +data-snapshots/ diff --git a/.zenodo.json b/.zenodo.json index 78f45af06a..692126ec27 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -55,5 +55,5 @@ "Goffman, E. (1961). Asylums: Essays on the Social Situation of Mental Patients and Other Inmates. Anchor Books." ], "notes": "MORIE is licensed AGPL-3.0-or-later (strong copyleft; see LICENSE). The optional Linux-kernel adjuncts under kernel-module/ and daemon/ are GPL-2.0-only (kernel ABI requirement). The Ontario SIU scraper is additionally subject to the Ontario open-government licence on the source corpus.", - "version": "0.9.4" + "version": "0.9.5" } diff --git a/CITATION.cff b/CITATION.cff index 2c468db254..789a555e38 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -8,7 +8,7 @@ authors: name-particle: "Singh" # morie pronunciation: /ˈmɔɪraɪ/ -- "MOY-rye", like the Greek Moirai (the Fates) # IPA: /ʋənʃ sɪŋ ɾʊˈɦeːlɑː/ -version: "0.9.4" +version: "0.9.5.2" date-released: "2026-05-18" url: "https://github.com/hadesllm/morie" repository-code: "https://github.com/hadesllm/morie" @@ -37,7 +37,7 @@ preferred-citation: given-names: "Vansh Singh" orcid: "https://orcid.org/0009-0004-1750-3592" year: "2026" - version: "0.9.4" + version: "0.9.5.2" doi: "10.5281/zenodo.20111233" url: "https://doi.org/10.5281/zenodo.20111233" references: @@ -48,7 +48,7 @@ references: given-names: "Vansh Singh" orcid: "https://orcid.org/0009-0004-1750-3592" year: "2026" - version: "0.9.4" + version: "0.9.5.2" doi: "10.5281/zenodo.20096350" url: "https://doi.org/10.5281/zenodo.20096350" notes: "Python companion software paper (same toolkit, Python implementation)." diff --git a/Dockerfile b/Dockerfile index 1b071d13e1..4d335c9458 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,7 +79,7 @@ RUN R CMD INSTALL --library=/usr/local/lib/R/site-library /build/r-package/morie # ─── Stage 3: Runtime ──────────────────────────────────────────────────────── FROM python:${PYTHON_VERSION}-slim AS runtime -ARG VERSION=0.9.4 +ARG VERSION=0.9.5 ARG VCS_REF=unknown ARG BUILD_DATE=unknown diff --git a/README.md b/README.md index 2097f6f83f..cf72eac642 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,9 @@ A multi-domain scientific computing toolkit (Python and R) for observational inference, with sociolegal, signal-processing, cryptographic, spatial-statistics, statistical-physics, and psychometrics modules. Hosts the MRM framework as a primary application for Canadian carceral, police, and oversight data analysis. +[![R CMD check](https://github.com/hadesllm/morie/actions/workflows/r-cmd-check.yml/badge.svg)](https://github.com/hadesllm/morie/actions/workflows/r-cmd-check.yml) +[![CI](https://github.com/hadesllm/morie/actions/workflows/ci.yml/badge.svg)](https://github.com/hadesllm/morie/actions/workflows/ci.yml) +[![CodeQL](https://github.com/hadesllm/morie/actions/workflows/codeql.yml/badge.svg)](https://github.com/hadesllm/morie/actions/workflows/codeql.yml) [![License: AGPL-3.0-or-later](https://img.shields.io/badge/license-AGPL--3.0--or--later-a42e2b.svg)](https://github.com/hadesllm/morie/blob/main/LICENSE) [![PyPI version](https://img.shields.io/pypi/v/morie.svg)](https://pypi.org/project/morie/) [![r-universe](https://img.shields.io/badge/r--universe-hadesllm-276DC3)](https://hadesllm.r-universe.dev/morie) @@ -96,7 +99,7 @@ pip install morie docker run --rm ghcr.io/hadesllm/morie:latest morie --help # Pin to a specific version (recommended for reproducibility) -docker run --rm ghcr.io/hadesllm/morie:0.9.4 morie --help +docker run --rm ghcr.io/hadesllm/morie:0.9.5.2 morie --help ``` Multi-arch image published on every release with both versioned and `:latest` tags. Requires only Docker — no Python, no pip. @@ -131,6 +134,19 @@ result = analyze_a01_mrm(df) print(result) ``` +## What's new in v0.9.5 + +- **SIU subsystem — first-class.** A full pipeline for the Ontario Special Investigations Unit director's-report corpus (English + French, 2005-present): `morie_fetch_siu()` with a polite token-bucket fetcher (4 req/s default, exponential backoff on 429/5xx, optional on-disk page cache), a hand-rolled C++ parser (`src/siu_parser.cpp`) that handles both 2015-2019 and 2020+ template families plus 2014 *Overview* and 2005 *Director's report* variants, 38 police-service acronyms (English + French) mapped to canonical English names, compound officer count handling, and a linear `html_to_text` state machine replacing the segfault-prone `std::regex_replace`. +- **Language-aware DRID manifest.** `inst/extdata/siu_drid_manifest.csv.gz` ships with 4,743 probed drids (en=2,531, fr=2,212, unknown=0) and a `canonical_drid` column for English-preferred dedupe. `morie_fetch_siu(lang = "en")` skips French drids — half the network round-trips. `morie_siu_index()` exposes the manifest. +- **Canonical override system — the parser learns.** `inst/extdata/siu_canonical_overrides.csv.gz` ships with 47 hand-verified corrections; `morie_siu_record_correction(case_number, field, value)` lets users add their own. Overrides are applied automatically at the end of every fetch. +- **Audit + AI tooling.** `morie_siu_audit_case()`, `morie_siu_compare()`, `morie_siu_sanity_check()`, `morie_siu_anomaly_check()`, `morie_siu_audit_columns()`, `morie_siu_translate()`, and `morie_siu_llm_extract()` with four providers — `ollama` (default, local, free), `gemini`, `claude`, `vertex` — and a `c("ollama", "gemini")` failover chain so paid APIs only fire when the local model fails. Defaults: `OLLAMA_HOST=http://localhost:11434`, `OLLAMA_MODEL=gemma3:4b`, `OLLAMA_KEEP_ALIVE=30m`. French → English translation via `translategemma:latest`. +- **559 exported `morie_*` R functions — every public callable now prefixed.** Cleared rOpenSci `pkgcheck`'s duplicated-function-names finding by renaming 352 unprefixed exports to `morie_*` across `R/`, `tests/`, `vignettes/`, `inst/`, and `data-raw/`. No aliases — the unprefixed names are gone from `NAMESPACE`. +- **TPS open-data ingestion fixes** (carried over from the original v0.9.5 plan). Corrected the Homicides and Shootings date ranges in the dataset catalog (`2004-present`, not `2014`); rewrote `morie_fetch_tps()` ArcGIS paging to follow the server's `exceededTransferLimit` flag so large layers are no longer silently truncated to the first page; daily-resolution Hawkes fits now build the occurrence date from the local-time `OCC_YEAR`/`OCC_MONTH`/`OCC_DAY` fields rather than the UTC-converted `OCC_DATE`. +- **`T_horizon` rename in the Hawkes C++ likelihood.** The time-horizon parameter was bare `T` in the auto-generated `R/RcppExports.R`, which `lintr` flags as a potential `TRUE` shadow. The C++ signature is now `T_horizon`; the math convention is preserved in C++ docstrings only. +- **rOpenSci 770 blockers cleared.** `.github/CONTRIBUTING.md` shipped, 16 `@return` docs added, 15 `@examples` added, full roxygen2 conversion (RoxygenNote 7.3.3), coverage validated ≥75% under `covr::package_coverage`, `\dontrun{}` count 72 → 0, `setwd()` replaced with `withr::local_dir()` in `R/workflow.R`. +- **Five-cell R CMD check matrix all green** on `release/v0.9.5-audit`: macOS-latest release, Windows-2025 release, Ubuntu-latest release, Ubuntu-latest release + postgres-15, Ubuntu-latest oldrel-1, Ubuntu-latest devel. Plus `pkgcheck`, `covr` + Codecov upload, `lintr`, `goodpractice`, and CodeQL. +- **Final SIU corpus stats**: 2,218 unique cases × 64 columns, 100.000% format-clean per `morie_siu_sanity_check()`. + ## What's new in v0.9.4 - **CRAN source-package compliance.** The R package's vendored copy of the shared C++ core header was renamed `morie_core.hpp` → `morie_core.h`. `R CMD check --as-cran` does not recognize `.hpp` as a `src/` file extension and warned about it; the rename clears the WARNING. No behaviour change — the canonical `libmorie/morie_core.hpp` (Python/CMake side) is unchanged. @@ -223,12 +239,12 @@ and the **empirical applications paper**. ``` # Software paper — R (also the R package source on Zenodo) Ruhela, V. S. (2026). morie: Multi-domain Open Research and Inferential -Estimation in R (v0.9.4). Zenodo. +Estimation in R (v0.9.5.2). Zenodo. https://doi.org/10.5281/zenodo.20111233 # Software paper — Python (also the Python package source on Zenodo) Ruhela, V. S. (2026). morie: Multi-domain Open Research and Inferential -Estimation in Python (v0.9.4). Zenodo. +Estimation in Python (v0.9.5.2). Zenodo. https://doi.org/10.5281/zenodo.20096350 # MRM framework paper (theoretical foundations) diff --git a/papers/hawkes-paper/main.tex b/papers/hawkes-paper/main.tex index a3894ef8f8..7e34b980ae 100644 --- a/papers/hawkes-paper/main.tex +++ b/papers/hawkes-paper/main.tex @@ -35,10 +35,10 @@ \newtheorem{definition}[theorem]{Definition} %% --- title / author block (jss conventions) --- -\title{Criminological Hawkes Process via MORIE: Markovian and +\title{Criminological Hawkes Process via morie: Markovian and Non-Markovian Self-Exciting Point Processes for Toronto Crime} -\Plaintitle{Criminological Hawkes Processes via MORIE} -\Shorttitle{Criminological Hawkes Process via MORIE} +\Plaintitle{Criminological Hawkes Processes via morie} +\Shorttitle{Criminological Hawkes Process via morie} \author{Vansh Singh Ruhela~\orcidlink{0009-0004-1750-3592}\\University of Toronto} \Plainauthor{Vansh Singh Ruhela} @@ -55,7 +55,7 @@ \Abstract{ This is a likelihood-based inference study for Hawkes self-exciting point processes applied to incident-level open data from the Toronto -Police Service. The classical Mohler--Bertozzi--Brantingham model +Police Service. The classical Mohler--Short--Brantingham model \citep{Mohler2011,mohler2014marked} assumes a constant baseline $\nu$ and an exponential excitation kernel $g(t)=\beta e^{-\beta t}$; under those assumptions the @@ -69,11 +69,11 @@ information for the general model, summarise the asymptotic-ergodicity argument that secures strong consistency and asymptotic normality of the MLE, and implement all of it as the new -\code{morie.tps\_hawkes\_advanced} module of the MORIE package +\code{morie.tps\_hawkes\_advanced} module of the \pkg{morie} package \citep{Ruhela2026MorieR}, where the fitted comparators are also exposed as MRM modules within the MRM framework \citep{Ruhela2026MRM}. Eight (kernel $\times$ baseline) combinations -are fit to each TPS incident category and ranked by AIC and +are fit to the TPS Assault incident series and ranked by AIC and time-rescaling goodness-of-fit. For Toronto Assault data, the non-Markovian Weibull-kernel $\times$ sinusoidal-baseline model is preferred over the Markovian classical Hawkes by $\Delta\mathrm{AIC} > 0$ @@ -87,10 +87,10 @@ non-Markovian kernel, criminology, Toronto Police Service, maximum likelihood estimation, time-rescaling theorem, R, Python} % --- morie version stamp ----------------------------------------------------- -% This paper documents \pkg{morie} v0.6.1 (released 2026-05-13). Specific +% This paper documents \pkg{morie} v0.9.5 (released 2026-05-18). Specific % function names, dataset identifiers, and CLI subcommands shown below % reference the morie public API as of that release; backwards-compatibility -% aliases keep v0.x code references resolving across the v0.4.x series. +% aliases keep v0.x code references resolving across the v0.x series. % ----------------------------------------------------------------------------- @@ -143,8 +143,8 @@ \section{Introduction}\label{sec:intro} \section{Setup and notation}\label{sec:setup} Let $0 \le t_1 < t_2 < \cdots < t_n \le T$ be the observed event -times of a simple point process $N$ on $[0,T]$ with $\mathcal{F}_t = -\sigma\{N_s : 0 \le s \le t\}$-conditional intensity +times of a simple point process $N$ on $[0,T]$, with $\mathcal{F}_t = +\sigma\{N_s : 0 \le s \le t\}$ the natural filtration. Decompose the kernel as $g(u) = \eta\,\tilde g(u; \psi)$ with $\eta \in (0,1)$ the branching ratio and $\tilde g(\cdot;\psi)$ a parametric density on $[0,\infty)$. Then @@ -385,8 +385,8 @@ \subsection{8-way kernel × baseline grid on TPS Assault 2014--2026}\label{sec:a $[0,\,T)$), fit at full sample under the compiled C++ likelihood core that this release ships ($O(n)$ recursive evaluation for the exponential kernel; sub-quadratic truncated and sum-of-exponentials -paths for the non-Markovian kernels). Section~\ref{sec:application}.B retains the original -$n=2{,}000$ sub-sample for backwards comparability with the prior sub-sampled analyses; readers seeking the canonical post-v0.2.1 Hawkes +paths for the non-Markovian kernels). Section~\ref{sec:application-8way} retains the original +$n=2{,}000$ sub-sample for backwards comparability with the prior sub-sampled analyses; readers seeking the canonical post-v0.9.1 Hawkes characterisation of TPS Assault should rely on Table~\ref{tab:tps-fit-2019}. @@ -491,7 +491,7 @@ \subsection{2019 contiguous-year re-fit (full-sample, compiled core)}\label{sec: substantial margin even after the four extra parameters of the sinusoidal baseline. By contrast, switching only the kernel (holding the baseline constant) gains essentially nothing -($\Delta\mathrm{AIC} \le 1.0$). The substantive conclusion is +($\Delta\mathrm{AIC} \le 2$). The substantive conclusion is that for Toronto Assault data the dominant departure from the classical model is the time-varying baseline, not the kernel shape --- but the KCD framework remains the inferential engine that @@ -504,8 +504,10 @@ \subsection{2019 contiguous-year re-fit (full-sample, compiled core)}\label{sec: mode away from $u=0$) and the baseline is no longer constant ($\hat\alpha_2,\hat\alpha_3$ encode an annual-cycle modulation of $\nu(t)$). All four sinusoidal-baseline fits dominate the -Markovian classical, and the within-baseline gap between kernels -is small ($\le 50$ AIC), so for these data the time-varying +Markovian classical, and although the kernels do separate under +the sinusoidal baseline (a within-baseline spread of about $50$ +AIC), that spread is still far below the $\approx 141$-AIC gain +from the baseline switch itself, so for these data the time-varying baseline is the dominant non-Markovian feature. \paragraph{Branching ratio and explosivity.} Across all eight @@ -520,7 +522,7 @@ \subsection{2019 contiguous-year re-fit (full-sample, compiled core)}\label{sec: \subsection[Reproducibility code]{Reproducibility: code for Tables~\ref{tab:tps-fit} and~\ref{tab:tps-fit-2019}} \label{sec:application-reproducibility} -Both tables are reproducible end-to-end from a fresh install of MORIE +Both tables are reproducible end-to-end from a fresh install of \pkg{morie} \citep{Ruhela2026MoriePy} against the public TPS Assault feed. \paragraph{Table~\ref{tab:tps-fit}: 8-way grid on $n=2{,}000$ sub-sample.} @@ -634,7 +636,7 @@ \section{Conclusion}\label{sec:conclusion} We have given a self-contained derivation of the likelihood, score, and information for a non-stationary, non-Markovian Hawkes process, summarised the asymptotic guarantees of Kwan-Chen-Dunsmuir, and -implemented the full machinery in MORIE. Markovian and +implemented the full machinery in \pkg{morie}. Markovian and non-Markovian fits are now comparable on the same footing, both on synthetic stress-tests and on Toronto Police Service incident feeds. The classical Mohler model is no longer the only Hawkes @@ -642,7 +644,7 @@ \section{Conclusion}\label{sec:conclusion} \section*{Acknowledgements} The Toronto Police Service open-data programme -(\texttt{data.torontopolice.on.ca}) provides the incident-level +(\url{https://data.torontopolice.on.ca}) provides the incident-level feeds analysed here. \paragraph*{AI assistance.} diff --git a/papers/hawkes-paper/refs.bib b/papers/hawkes-paper/refs.bib index a03b82d1a3..74045c03fd 100644 --- a/papers/hawkes-paper/refs.bib +++ b/papers/hawkes-paper/refs.bib @@ -276,7 +276,6 @@ @article{Hawkes1971 doi = {10.1093/biomet/58.1.83} } -doi = {10.1109/TIT.1981.1056305} @article{Ogata1981, author = {Ogata, Yosihiko}, title = {On {L}ewis' simulation method for point processes}, @@ -284,7 +283,8 @@ @article{Ogata1981 year = {1981}, volume = {27}, number = {1}, - pages = {23--31} + pages = {23--31}, + doi = {10.1109/TIT.1981.1056305} } @book{Daley2003, @@ -392,7 +392,7 @@ @misc{Ruhela2026MorieR title = {morie: Multi-domain Open Research and Inferential Estimation in {R}}, year = {2026}, doi = {10.5281/zenodo.20111233}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026MoriePy, @@ -400,7 +400,7 @@ @misc{Ruhela2026MoriePy title = {morie: Multi-domain Open Research and Inferential Estimation in {Python}}, year = {2026}, doi = {10.5281/zenodo.20096350}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026Hawkes, @@ -415,14 +415,14 @@ @misc{Ruhela2026Hawkes % (Dropped duplicate mohler2011self — same paper as Mohler2011.) -doi = {10.1073/pnas.0910921107} @article{short2010pde, author = {Short, Martin B. and Brantingham, P.\ Jeffrey and Bertozzi, Andrea L. and Tita, George E.}, title = {Dissipation and displacement of hotspots in reaction-diffusion models of crime}, journal = {Proceedings of the National Academy of Sciences}, year = {2010}, volume = {107}, - pages = {3961--3965} + pages = {3961--3965}, + doi = {10.1073/pnas.0910921107} } % (Dropped duplicate daley2003intro — same paper as Daley2003.) @@ -436,14 +436,14 @@ @article{oakes1975markovian pages = {69--77} } -doi = {10.2307/2999632} @article{engle1998autoregressive, author = {Engle, Robert F. and Russell, Jeffrey R.}, title = {Autoregressive Conditional Duration: A New Model for Irregularly Spaced Transaction Data}, journal = {Econometrica}, year = {1998}, volume = {66}, - pages = {1127--1162} + pages = {1127--1162}, + doi = {10.2307/2999632} } @unpublished{kwan2023asymptotic, @@ -453,14 +453,14 @@ @unpublished{kwan2023asymptotic year = {2023} } -doi = {10.7326/M20-0504} @article{lauer2020incubation, author = {Lauer, Stephen A. and Grantz, Kyra H. and Bi, Qifang and Jones, Forrest K. and Zheng, Qulu and Meredith, Hannah R. and Azman, Andrew S. and Reich, Nicholas G. and Lessler, Justin}, title = {The incubation period of coronavirus disease 2019 ({COVID-19})}, journal = {Annals of Internal Medicine}, year = {2020}, volume = {172}, - pages = {577--582} + pages = {577--582}, + doi = {10.7326/M20-0504} } @article{escobar2020hawkes, @@ -470,14 +470,14 @@ @article{escobar2020hawkes year = {2020} } -doi = {10.1142/S2382626615500057} @article{bacry2015hawkes, author = {Bacry, Emmanuel and Mastromatteo, Iacopo and Muzy, Jean-Fran{\c{c}}ois}, title = {Hawkes processes in finance}, journal = {Market Microstructure and Liquidity}, year = {2015}, volume = {1}, - number = {1} + number = {1}, + doi = {10.1142/S2382626615500057} } @book{gould2013limit, diff --git a/papers/morie-empirical-paper/main.tex b/papers/morie-empirical-paper/main.tex index 0ebbdc2597..3ebed66255 100644 --- a/papers/morie-empirical-paper/main.tex +++ b/papers/morie-empirical-paper/main.tex @@ -62,8 +62,10 @@ recovers a duration-only Torture$^{\ast}$ proxy rising from $12.5\%$ in 2023 to $20.6\%$ in 2025 (provincial segregation) and from $31.5\%$ to $40.9\%$ under broader restrictive -confinement, exceeding the federal $9.9\%$ \citep{SprottDoob2023} -by $+10.7$ percentage points at peak. Statistical-physics +confinement. The 2025 provincial segregation rate exceeds the +federal $9.9\%$ \citep{SprottDoob2021} by $+10.7$ percentage +points, and the 2025 restrictive-confinement rate exceeds it by +$+31.0$ percentage points. Statistical-physics diagnostics on the Toronto Police Service crime corpus support a non-Markovian (Weibull/sinusoidal) Hawkes preference over the classical Markovian specification, and Goffmanian institutional @@ -81,10 +83,10 @@ Hawkes process, MRM framework, sociolegal statistics, R, Python} % --- morie version stamp ----------------------------------------------------- -% This paper documents \pkg{morie} v0.6.1 (released 2026-05-13). Specific +% This paper documents \pkg{morie} v0.9.5 (released 2026-05-18). Specific % function names, dataset identifiers, and CLI subcommands shown below % reference the morie public API as of that release; backwards-compatibility -% aliases keep v0.x code references resolving across the v0.4.x series. +% aliases keep v0.x code references resolving across the v0.x series. % ----------------------------------------------------------------------------- @@ -428,7 +430,7 @@ \section{Descriptive statistics} \begin{table}[h] \centering -\caption{Sample structure of \OTIS{} b01 over fiscal 2023--2025 +\caption{Sample structure of \OTIS{} a01 over fiscal 2023--2025 (verified ground-truth counts from the published OTIS-RC analysis; reproducible from \code{morie::mrm\_otis\_load()}).} \label{tab:otis-counts} @@ -485,10 +487,12 @@ \subsection{Pooled 2023--2025 DML estimates} Both estimands are statistically distinguishable from zero at every conventional level; the t-statistics correspond to $p < 10^{-143}$ for the ATE and $p < 10^{-145}$ for the ATTE. The point estimates -correspond to a $16$ percentage-point increase in the probability -that the most recent placement is in a different region than the -segregation-event placement, conditional on the alert-complexity -indicator switching from low to high. +correspond to an increase of approximately $0.16$ regional +transitions per person-year in the expected count $\vm$, +conditional on the alert-complexity indicator switching from low to +high. Because $\vm$ is a non-negative transition count, not a +binary indicator, this is a difference in expected counts and not a +change in a probability. \subsection{Per-year DML estimates: rising trend} \label{sec:dml-per-year} @@ -551,7 +555,13 @@ \subsection{Multi-way clustered standard errors} \end{table} The point estimate is concentrated in a narrow band, and the -inference is robust to the choice of clustering scheme. The min and +inference is robust to the choice of clustering scheme. Each cell +of the grid is a separate region-contrast-specific estimate +re-estimated under one clustering scheme; the clustering scheme +alters only the standard errors, while the point-estimate spread +across the band reflects genuine heterogeneity across the regional +contrasts rather than the pooled ATE of $0.1605$ +(Table~\ref{tab:res-pool}) re-clustered. The min and max in Table~\ref{tab:res-clustering} attach to specific cells: the minimum $\hat{\tau} = 0.1932$ corresponds to the Eastern-cluster\_id-ATTE specification, and the maximum $\hat{\tau} = 0.2013$ corresponds to @@ -598,7 +608,7 @@ \subsection{Identification robustness summary} \subsection{Federal operationalisation} \label{sec:mandela-federal} -\citet{SprottDoob2023} jointly operationalise Rules 43 and 44 on +\citet{SprottDoob2021} jointly operationalise Rules 43 and 44 on federal Structured Intervention Unit person-stays: \begin{equation} \label{eq:federal-mandela} @@ -613,7 +623,7 @@ \subsection{Federal operationalisation} inmate did not receive the legally-required four hours out-of-cell. Across $N = 1{,}960$ federal SIU person-stays (November 2019 to September 2020), the classifier yields $28.4\%$ Solitary, $9.9\%$ -Torture, and $61.7\%$ All other \citep{SprottDoob2023}. +Torture, and $61.7\%$ All other \citep{SprottDoob2021}. \subsection{Provincial operationalisation} \label{sec:mandela-provincial} @@ -644,13 +654,14 @@ \subsection{Cross-jurisdiction comparison} \label{sec:mandela-cross-jurisdiction} Table~\ref{tab:res-mandela} reports the Mandela classification for the -federal Structured Intervention Unit regime \citep{SprottDoob2023} +federal Structured Intervention Unit regime \citep{SprottDoob2021} alongside the provincial \OTIS{} Segregation and broader Restrictive Confinement views, by fiscal year. The provincial Segregation proportion classified as Torture$^{\ast}$ rises from $12.5\%$ in -fiscal 2023 to $20.6\%$ in fiscal 2025, a peak gap of $+10.7$ -percentage points relative to the federal $9.9\%$ that -\citet{SprottDoob2023} report. Under the broader +fiscal 2023 to $20.6\%$ in fiscal 2025; the 2025 segregation rate +exceeds the federal $9.9\%$ that \citet{SprottDoob2021} report by +$+10.7$ percentage points (and the broader restrictive-confinement +rate exceeds it by $+31.0$ points). Under the broader restrictive-confinement view (\OTIS{} c11 \texttt{NumberIndividuals\_RestrictiveConfinement} column), the provincial Torture$^{\ast}$ rate reaches $40.9\%$ in fiscal 2025. @@ -660,7 +671,7 @@ \subsection{Cross-jurisdiction comparison} \caption{Cross-jurisdiction Mandela classification. Federal: person-stays, full operationalisation~\eqref{eq:federal-mandela}. Provincial: individuals, duration-only proxy~\eqref{eq:provincial-mandela}. -Source: \citealt{SprottDoob2023} (federal) and the official Ontario +Source: \citealt{SprottDoob2021} (federal) and the official Ontario \OTIS{} c11 aggregate (provincial). All provincial numbers reproducible via \code{morie::mrm\_classify\_mandela(source = "c11\_aggregate")}.} @@ -732,7 +743,7 @@ \subsection{Per-category Hawkes fits (Markovian)}\label{phys-hawkes} \paragraph*{Verification status.} -carried from prior analyses; MA-thesis values pending re-fit against the maintained TPS Assault CSV.\footnote{\texttt{moirais-dev/dev/sphinx/project/data/datasets/TPS/Assault/CSV/Assault\_Open\_Data\_*.csv}} +carried from prior analyses; MA-thesis values pending re-fit against the maintained TPS Assault open-data series.\footnote{Toronto Police Service Assault Open Data, published on the TPS Public Safety Data Portal (ArcGIS open-data layer). The \texttt{morie} package retrieves it reproducibly via \texttt{morie\_fetch\_tps("Assault")}, so no local path is required.} Table~\ref{tab:hawkes-markovian} reports the Markovian (M1) Hawkes maximum-likelihood fits per TPS category, refitted with U(0,1)-day @@ -1160,7 +1171,7 @@ \subsection{Crime Severity Index --- robustness check}\label{phys-csi} homicides drop but minor thefts rise will see falling CSI even if absolute crime counts are flat. -Table~\ref{tab:csi-toronto} reports MORIE-computed Toronto CSI +Table~\ref{tab:csi-toronto} reports \pkg{morie}-computed Toronto CSI for 2014--2025 from the \texttt{morie.tps\_csi} module, rebased to 2014 = 100. Both the Total and Violent variants are shown. Source weights: @@ -1169,7 +1180,7 @@ \subsection{Crime Severity Index --- robustness check}\label{phys-csi} \begin{table}[h] \centering\footnotesize -\caption{Toronto Crime Severity Index, FY2014-2025, MORIE-computed +\caption{Toronto Crime Severity Index, FY2014-2025, morie-computed (\texttt{morie.tps\_csi.csi\_per\_year} on the 9 TPS open-data feeds, rebased so 2014 = 100). Total CSI weights all nine TPS categories; Violent CSI zeros out non-violent (B\&E, Auto Theft, @@ -1206,7 +1217,7 @@ \subsection{Crime Severity Index --- robustness check}\label{phys-csi} \paragraph{Cross-check against StatsCan published values.} StatsCan's published Toronto CSI for 2023 is approximately $60.4$ (Total) and $71.8$ (Violent) on its national 2006-baseline scale. -The MORIE module reports values at a different scale (raw +The \pkg{morie} module reports values at a different scale (raw weighted sum per 100k) because it does not have access to the 2006 national-baseline normalisation factor; rebasing locally to 2014 = 100 (as in Table~\ref{tab:csi-toronto}) gives a clean @@ -1224,11 +1235,15 @@ \subsection{Crime Severity Index --- robustness check}\label{phys-csi} and gives a pooled $\hat\tau_{\text{ATE}} = 0.213$ (cluster-robust SE $0.012$, $n=14{,}520$, $95\%$ CI $[0.189,\,0.236]$) with $\hat\tau_{\text{ATTE}}=0.213$ and $\hat\tau_{\text{ATC}}=0.212$. -The ATE is stable to within $\le 0.002$ of the CSI-blind estimate, -so the severity of the surrounding crime environment is not a -substantial confounder of the alert-complexity~$\to$~placement-volatility -relationship --- the DML ensemble already absorbs it through the -demographic, regional and seasonal controls. +Within this CSI-augmented re-fit the ATE, ATTE and ATC agree to +within $\le 0.002$ of one another ($0.213/0.213/0.212$). A direct +comparison to the CSI-blind §\ref{sec:dml} estimate is deferred: +that pipeline is run on a different sample, so the two are not +estimand-identical, and these overlay numbers themselves carry the +not-yet-re-run status of this subsection. We therefore read the +overlay as indicative that conditioning on environmental severity +does not overturn the positive alert-complexity effect, pending a +matched re-fit. \subsection{Extended spatial diagnostics}\label{phys-spatial-extended} @@ -1464,7 +1479,7 @@ \section{Reproducibility code -- full morie commands for every reported result} R> # Hawkes fits are produced offline; the maintained per-category R> # parameter manifest ships with the package R> refit_path <- system.file("extdata", -+ "tps_hawkes_refit_v0.6.1.json", package = "morie") ++ "tps_hawkes_refit.json", package = "morie") R> refit <- mrm_tps_load_hawkes_refit(refit_path) R> # Selection table: AIC ranking, KS p-values per kernel/baseline R> subset(refit$summary, crime == "Assault") diff --git a/papers/morie-empirical-paper/refs.bib b/papers/morie-empirical-paper/refs.bib index 04cb6dcab9..912faf82eb 100644 --- a/papers/morie-empirical-paper/refs.bib +++ b/papers/morie-empirical-paper/refs.bib @@ -20,20 +20,6 @@ @techreport{SprottDoob2021 note = {Verified URL 2026-05-14; the headline 9.9\% federal Mandela ``torture'' figure is published here.} } -% SprottDoob2023 is kept as an alias key for backward-compat with existing -% \citet{SprottDoob2023} calls. The headline 9.9% figure is from the -% February 2021 report above; the year-2023 key reflects the date our -% papers were drafted, not the report's publication year. -@techreport{SprottDoob2023, - author = {Sprott, Jane B. and Doob, Anthony N.}, - title = {Solitary Confinement, Torture, and {C}anada's Structured Intervention Units}, - institution = {Centre for Criminology and Sociolegal Studies, University of Toronto}, - year = {2021}, - month = feb, - type = {Report}, - url = {https://www.crimsl.utoronto.ca/sites/www.crimsl.utoronto.ca/files/TortureSolitarySIUsSprottDoob23Feb2021_0.pdf}, - note = {Verified URL 2026-05-14; alias of the Sprott \& Doob 2021 report.} -} @techreport{DoobSprott2020, author = {Doob, Anthony N. and Sprott, Jane B.}, @@ -276,7 +262,6 @@ @article{Hawkes1971 doi = {10.1093/biomet/58.1.83} } -doi = {10.1109/TIT.1981.1056305} @article{Ogata1981, author = {Ogata, Yosihiko}, title = {On {L}ewis' simulation method for point processes}, @@ -284,7 +269,8 @@ @article{Ogata1981 year = {1981}, volume = {27}, number = {1}, - pages = {23--31} + pages = {23--31}, + doi = {10.1109/TIT.1981.1056305} } @book{Daley2003, @@ -392,7 +378,7 @@ @misc{Ruhela2026MorieR title = {morie: Multi-domain Open Research and Inferential Estimation in {R}}, year = {2026}, doi = {10.5281/zenodo.20111233}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026MoriePy, @@ -400,7 +386,7 @@ @misc{Ruhela2026MoriePy title = {morie: Multi-domain Open Research and Inferential Estimation in {Python}}, year = {2026}, doi = {10.5281/zenodo.20096350}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026Hawkes, diff --git a/papers/morie-py-paper/main.tex b/papers/morie-py-paper/main.tex index bc380ca877..f0fed9a4c9 100644 --- a/papers/morie-py-paper/main.tex +++ b/papers/morie-py-paper/main.tex @@ -61,7 +61,7 @@ implementation, the wrapper API around \pkg{DoubleML}, the \pkg{RichResult} return-type design, and three reproducible examples on the open Ontario, federal, and Toronto Police Service -records that the MRM framework integrates. Version~0.9.4 ships +records that the MRM framework integrates. Version~0.9.5 ships 275 textbook-derived callables in \code{morie.fn} across fourteen suites with full \proglang{R} parity, and adopts a licensing model under which the \proglang{Python} and @@ -74,10 +74,10 @@ double machine learning, sociolegal statistics, Hawkes process, psychometrics, Python} % --- morie version stamp ----------------------------------------------------- -% This paper documents \pkg{morie} v0.9.4 (released 2026-05-18). Specific +% This paper documents \pkg{morie} v0.9.5 (released 2026-05-18). Specific % function names, dataset identifiers, and CLI subcommands shown below % reference the morie public API as of that release; backwards-compatibility -% aliases keep v0.x code references resolving across the v0.4.x series. +% aliases keep v0.x code references resolving across the v0.x series. % ----------------------------------------------------------------------------- @@ -95,8 +95,8 @@ \section{Introduction} strong copyleft licence under which any modified version that is distributed, or offered to users over a network, must publish its source. The R companion -package \citep{Ruhela2026MorieR} is released under -GPL-2.0-only to match the R-ecosystem and CRAN convention. The +package \citep{Ruhela2026MorieR} is likewise released under +\code{AGPL-3.0-or-later}, matching the Python package. The package supports a unified estimator interface for the average treatment effect (ATE), the average treatment effect on the treated (ATT), the @@ -272,7 +272,7 @@ \section{Module taxonomy} \label{sec:taxonomy} The \pkg{morie} \proglang{Python} interface organises -functionality into eight thematic submodules. +functionality into eight thematic groups. \subsection{morie.causal} @@ -497,7 +497,7 @@ \section{Reproducible examples} \subsection{Example 2: replication of the Sprott--Doob chi-squared record} The second example replicates the published $\chi^{2}$ statistics -from \citet{SprottDoob2023} on the federal $\SIU$ aggregate +from \citet{SprottDoob2021} on the federal $\SIU$ aggregate record. \pkg{morie} does not ship a dedicated \code{mrm\_chi2\_doob} wrapper at present; the standard \code{scipy.stats.chi2\_contingency} routine reproduces every @@ -506,7 +506,7 @@ \subsection{Example 2: replication of the Sprott--Doob chi-squared record} \begin{Code} >>> import numpy as np >>> from scipy.stats import chi2_contingency ->>> # Sprott-Doob 2023 Table 3: Indigenous status x SIU placement +>>> # Sprott-Doob 2021 Table 3: Indigenous status x SIU placement >>> table3 = np.array([[196, 87], ... [1432, 754]]) >>> chi2, pval, dof, _ = chi2_contingency(table3, correction=False) @@ -531,8 +531,9 @@ \subsection{Example 3: Hawkes self-exciting process on TPS major crime} $O(n)$ recursion for the exponential kernel, and sub-quadratic truncated and sum-of-exponentials paths for the non-Markovian kernels, with a Numba-JIT path retained as a fallback. The -\code{compare\_hawkes\_kernels} routine fits all 8 (kernel $\times$ -baseline) combinations and ranks by AIC: +\code{compare\_hawkes\_kernels} routine fits every requested +(kernel $\times$ baseline) combination --- here the four +combinations of two kernels and two baselines --- and ranks by AIC: \begin{Code} >>> import pandas as pd @@ -736,7 +737,7 @@ \section{Conclusion} multi-domain toolkit for observational inference, with explicit support for the MRM framework as a sociolegal application target. The wrapper architecture around \pkg{DoubleML} offers a thin -function-style API; the eight thematic submodules provide a +function-style API; the eight thematic groups provide a unified namespace for the broader observational-inference, signal-processing, spatial, statistical-physics, and psychometric routines that complement the causal core. The MRM modules diff --git a/papers/morie-py-paper/refs.bib b/papers/morie-py-paper/refs.bib index 6577da1e6b..9b47041c3e 100644 --- a/papers/morie-py-paper/refs.bib +++ b/papers/morie-py-paper/refs.bib @@ -20,20 +20,6 @@ @techreport{SprottDoob2021 note = {Verified URL 2026-05-14; the headline 9.9\% federal Mandela ``torture'' figure is published here.} } -% SprottDoob2023 is kept as an alias key for backward-compat with existing -% \citet{SprottDoob2023} calls. The headline 9.9% figure is from the -% February 2021 report above; the year-2023 key reflects the date our -% papers were drafted, not the report's publication year. -@techreport{SprottDoob2023, - author = {Sprott, Jane B. and Doob, Anthony N.}, - title = {Solitary Confinement, Torture, and {C}anada's Structured Intervention Units}, - institution = {Centre for Criminology and Sociolegal Studies, University of Toronto}, - year = {2021}, - month = feb, - type = {Report}, - url = {https://www.crimsl.utoronto.ca/sites/www.crimsl.utoronto.ca/files/TortureSolitarySIUsSprottDoob23Feb2021_0.pdf}, - note = {Verified URL 2026-05-14; alias of the Sprott \& Doob 2021 report.} -} @techreport{DoobSprott2020, author = {Doob, Anthony N. and Sprott, Jane B.}, @@ -276,7 +262,6 @@ @article{Hawkes1971 doi = {10.1093/biomet/58.1.83} } -doi = {10.1109/TIT.1981.1056305} @article{Ogata1981, author = {Ogata, Yosihiko}, title = {On {L}ewis' simulation method for point processes}, @@ -284,7 +269,8 @@ @article{Ogata1981 year = {1981}, volume = {27}, number = {1}, - pages = {23--31} + pages = {23--31}, + doi = {10.1109/TIT.1981.1056305} } @book{Daley2003, @@ -392,7 +378,7 @@ @misc{Ruhela2026MorieR title = {morie: Multi-domain Open Research and Inferential Estimation in {R}}, year = {2026}, doi = {10.5281/zenodo.20111233}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026MoriePy, @@ -400,7 +386,7 @@ @misc{Ruhela2026MoriePy title = {morie: Multi-domain Open Research and Inferential Estimation in {Python}}, year = {2026}, doi = {10.5281/zenodo.20096350}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026Hawkes, @@ -533,30 +519,28 @@ @inproceedings{Vaswani2017Attention doi = {10.48550/arXiv.1706.03762} } -doi = {10.48550/arXiv.2205.14135} - @inproceedings{Dao2022FlashAttention, author = {Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'{e}}, Christopher}, title = {{FlashAttention}: Fast and Memory-Efficient Exact Attention with {IO}-Awareness}, booktitle = {Advances in Neural Information Processing Systems 35 (NeurIPS 2022)}, - year = {2022} + year = {2022}, + doi = {10.48550/arXiv.2205.14135} } -doi = {10.48550/arXiv.2104.09864} @article{Su2021RoPE, author = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng}, title = {{RoFormer}: Enhanced Transformer with Rotary Position Embedding}, journal = {arXiv preprint arXiv:2104.09864}, - year = {2021} + year = {2021}, + doi = {10.48550/arXiv.2104.09864} } -doi = {10.48550/arXiv.1910.07467} - @inproceedings{ZhangSennrich2019RMSNorm, author = {Zhang, Biao and Sennrich, Rico}, title = {Root Mean Square Layer Normalization}, booktitle = {Advances in Neural Information Processing Systems 32 (NeurIPS 2019)}, - year = {2019} + year = {2019}, + doi = {10.48550/arXiv.1910.07467} } @book{Fauzi2018Kernel, @@ -593,15 +577,14 @@ @book{HyndmanAthanasopoulos2018 address = {Melbourne, Australia} } -doi = {10.1002/9780470644560} - @book{Tsay2010FinancialTimeSeries, author = {Tsay, Ruey S.}, title = {Analysis of Financial Time Series}, edition = {3rd}, publisher = {Wiley}, year = {2010}, - address = {Hoboken, NJ} + address = {Hoboken, NJ}, + doi = {10.1002/9780470644560} } @article{Laniyonu2018CoffeeShops, diff --git a/papers/morie-r-paper/main.tex b/papers/morie-r-paper/main.tex index 2bc2037ce5..a37856c272 100644 --- a/papers/morie-r-paper/main.tex +++ b/papers/morie-r-paper/main.tex @@ -16,7 +16,7 @@ %% --- math macros --- \providecommand{\Rset}{\mathbb{R}} -\renewcommand{\E}{\mathbb{E}} +\providecommand{\E}{\mathbb{E}} \newcommand{\indep}{\perp\!\!\!\perp} \newcommand{\OTIS}{\textsf{OTIS}} \newcommand{\SIU}{\textsf{SIU}} @@ -55,9 +55,9 @@ crime (Hawkes processes, reaction--diffusion), psychometric classical-test-theory and item-response-theory estimators, and a suite of MRM modules for ingestion of Canadian open carceral data. -The package is released on the Comprehensive R Archive Network -(CRAN) and via r-universe at -\href{https://hadesllm.r-universe.dev/morie}{hadesllm.r-universe.dev/morie}. +The package is distributed via r-universe at +\href{https://hadesllm.r-universe.dev/morie}{hadesllm.r-universe.dev/morie} +and from source on GitHub; it is not on CRAN. This paper describes the architecture of the \proglang{R} implementation, the wrapper API around \pkg{DoubleML}, and three reproducible examples drawn from the open Ontario, federal, and @@ -70,10 +70,10 @@ double machine learning, sociolegal statistics, Hawkes process, psychometrics, R} % --- morie version stamp ----------------------------------------------------- -% This paper documents \pkg{morie} v0.6.1 (released 2026-05-13). Specific +% This paper documents \pkg{morie} v0.9.5 (released 2026-05-18). Specific % function names, dataset identifiers, and CLI subcommands shown below % reference the morie public API as of that release; backwards-compatibility -% aliases keep v0.x code references resolving across the v0.4.x series. +% aliases keep v0.x code references resolving across the v0.x series. % ----------------------------------------------------------------------------- @@ -144,7 +144,7 @@ \section{Introduction} around \pkg{DoubleML}. Section~\ref{sec:examples} works three reproducible examples on the Ontario, federal, and TPS data. Section~\ref{sec:mrm-empirical-callables} catalogues the -twelve empirical-workflow callables added in v0.1.15. +thirteen empirical-workflow callables added in v0.1.15. Section~\ref{sec:textbook-callables} catalogues the 275 textbook-derived callables that ship in \code{morie.fn} across fourteen suites in v0.2.1 and v0.3.0. @@ -156,14 +156,9 @@ \section{Getting started} \subsection{Installation} -The released \pkg{morie} package is available on CRAN: - -\begin{Code} -R> install.packages("morie") -\end{Code} - -\noindent Nightly Linux, macOS, and Windows binaries are built by -r-universe \citep{Ooms2021runiverse} from the GitHub source tree +The released \pkg{morie} package is not on CRAN; it is distributed +via r-universe. Nightly Linux, macOS, and Windows binaries are built +by r-universe \citep{Ooms2021runiverse} from the GitHub source tree and can be installed from the project's r-universe page: \begin{Code} @@ -434,7 +429,7 @@ \section{Wrapper architecture for DoubleML} \noindent The wrapper performs three additional services beyond the \pkg{DoubleML} class call. First, it harmonises the result -object to a \code{RichResult} that prints a multi-section +object to a \code{morie\_result} that prints a multi-section interpretation with verbose interpretation under a uniform format. Second, it audits learner-result quality through cross-validation $R^{2}$ and Brier-score diagnostics. Third, it surfaces both the @@ -499,7 +494,7 @@ \section{Reproducible examples} \subsection{Example 2: replication of the Sprott--Doob chi-squared record} The second example replicates the published $\chi^{2}$ statistics -from \citet{SprottDoob2023} on the federal $\SIU$ aggregate +from \citet{SprottDoob2021} on the federal $\SIU$ aggregate record. \pkg{morie} does not yet ship a dedicated \code{mrm\_chi2\_doob()} wrapper; base \proglang{R}'s \code{chisq.test()} reproduces every published statistic to @@ -542,7 +537,7 @@ \subsection{Example 3: Hawkes self-exciting process on TPS major crime} \begin{Code} R> refit <- mrm_tps_load_hawkes_refit( -+ system.file("extdata", "tps_hawkes_refit_v0.6.1.json", ++ system.file("extdata", "tps_hawkes_refit.json", + package = "morie")) R> subset(refit$summary, + crime == "Assault" & year == 2019) @@ -589,14 +584,14 @@ \section{Simulation study} \section[Empirical workflow callables]{Empirical workflow callables}\label{sec:mrm-empirical-callables} -The current release ships twelve new exported callables that compose +The current release ships thirteen new exported callables that compose into the empirical workflow used by the companion empirical paper \citep{Ruhela2026MorieEmpirical}. They are documented here for completeness; every function has full \proglang{R} + \proglang{Python} parity (the \proglang{Python} side is described in the morie \proglang{Python} paper, this volume). -\paragraph{OTIS suite.} Five callables operate on the Ontario +\paragraph{OTIS suite.} Five callables operate on the Offender Tracking Information System public release. \code{mrm_classify_mandela()} (introduced in v0.1.14) classifies placements under the United Nations Mandela Rules with three denominator conventions (row, diff --git a/papers/morie-r-paper/refs.bib b/papers/morie-r-paper/refs.bib index 6577da1e6b..9b47041c3e 100644 --- a/papers/morie-r-paper/refs.bib +++ b/papers/morie-r-paper/refs.bib @@ -20,20 +20,6 @@ @techreport{SprottDoob2021 note = {Verified URL 2026-05-14; the headline 9.9\% federal Mandela ``torture'' figure is published here.} } -% SprottDoob2023 is kept as an alias key for backward-compat with existing -% \citet{SprottDoob2023} calls. The headline 9.9% figure is from the -% February 2021 report above; the year-2023 key reflects the date our -% papers were drafted, not the report's publication year. -@techreport{SprottDoob2023, - author = {Sprott, Jane B. and Doob, Anthony N.}, - title = {Solitary Confinement, Torture, and {C}anada's Structured Intervention Units}, - institution = {Centre for Criminology and Sociolegal Studies, University of Toronto}, - year = {2021}, - month = feb, - type = {Report}, - url = {https://www.crimsl.utoronto.ca/sites/www.crimsl.utoronto.ca/files/TortureSolitarySIUsSprottDoob23Feb2021_0.pdf}, - note = {Verified URL 2026-05-14; alias of the Sprott \& Doob 2021 report.} -} @techreport{DoobSprott2020, author = {Doob, Anthony N. and Sprott, Jane B.}, @@ -276,7 +262,6 @@ @article{Hawkes1971 doi = {10.1093/biomet/58.1.83} } -doi = {10.1109/TIT.1981.1056305} @article{Ogata1981, author = {Ogata, Yosihiko}, title = {On {L}ewis' simulation method for point processes}, @@ -284,7 +269,8 @@ @article{Ogata1981 year = {1981}, volume = {27}, number = {1}, - pages = {23--31} + pages = {23--31}, + doi = {10.1109/TIT.1981.1056305} } @book{Daley2003, @@ -392,7 +378,7 @@ @misc{Ruhela2026MorieR title = {morie: Multi-domain Open Research and Inferential Estimation in {R}}, year = {2026}, doi = {10.5281/zenodo.20111233}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026MoriePy, @@ -400,7 +386,7 @@ @misc{Ruhela2026MoriePy title = {morie: Multi-domain Open Research and Inferential Estimation in {Python}}, year = {2026}, doi = {10.5281/zenodo.20096350}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026Hawkes, @@ -533,30 +519,28 @@ @inproceedings{Vaswani2017Attention doi = {10.48550/arXiv.1706.03762} } -doi = {10.48550/arXiv.2205.14135} - @inproceedings{Dao2022FlashAttention, author = {Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'{e}}, Christopher}, title = {{FlashAttention}: Fast and Memory-Efficient Exact Attention with {IO}-Awareness}, booktitle = {Advances in Neural Information Processing Systems 35 (NeurIPS 2022)}, - year = {2022} + year = {2022}, + doi = {10.48550/arXiv.2205.14135} } -doi = {10.48550/arXiv.2104.09864} @article{Su2021RoPE, author = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng}, title = {{RoFormer}: Enhanced Transformer with Rotary Position Embedding}, journal = {arXiv preprint arXiv:2104.09864}, - year = {2021} + year = {2021}, + doi = {10.48550/arXiv.2104.09864} } -doi = {10.48550/arXiv.1910.07467} - @inproceedings{ZhangSennrich2019RMSNorm, author = {Zhang, Biao and Sennrich, Rico}, title = {Root Mean Square Layer Normalization}, booktitle = {Advances in Neural Information Processing Systems 32 (NeurIPS 2019)}, - year = {2019} + year = {2019}, + doi = {10.48550/arXiv.1910.07467} } @book{Fauzi2018Kernel, @@ -593,15 +577,14 @@ @book{HyndmanAthanasopoulos2018 address = {Melbourne, Australia} } -doi = {10.1002/9780470644560} - @book{Tsay2010FinancialTimeSeries, author = {Tsay, Ruey S.}, title = {Analysis of Financial Time Series}, edition = {3rd}, publisher = {Wiley}, year = {2010}, - address = {Hoboken, NJ} + address = {Hoboken, NJ}, + doi = {10.1002/9780470644560} } @article{Laniyonu2018CoffeeShops, diff --git a/papers/mrm-formulations-paper/main.tex b/papers/mrm-formulations-paper/main.tex index ff46f3fc12..1e88d20c17 100644 --- a/papers/mrm-formulations-paper/main.tex +++ b/papers/mrm-formulations-paper/main.tex @@ -7,7 +7,7 @@ \usepackage{lmodern} \usepackage{microtype} \usepackage{orcidlink} -\newcommand{\R}{\mathbb{R}} +\providecommand{\R}{\mathbb{R}} %% --- overflow tolerance --- \setlength{\emergencystretch}{4em} @@ -53,11 +53,11 @@ multi-source and under-using available estimators. MRM addresses this gap by coupling a ten-estimator per-individual causal ensemble (IPW, AIPW, IRM-DML, PSM, PLR, and -AIPW-SuperLearner variants) with a $\chi^{2}$ family for +PLR-SuperLearner variants) with a $\chi^{2}$ family for aggregate contingency tables, a Mandela Rules classifier operating at both federal and provincial level, and a Hawkes self-exciting process for spatio-temporal modelling. -Implementation lives in the open-source MORIE toolkit +Implementation lives in the open-source \pkg{morie} toolkit \citep{Ruhela2026MorieR, Ruhela2026MoriePy}. Empirical contribution: a duration-only Mandela-prolonged proxy applied to the Ontario restrictive-confinement microdata rises from @@ -77,10 +77,10 @@ carceral data, OTIS, Special Investigations Unit, Hawkes process, Mandela Rules, MRM framework, sociolegal statistics, R, Python} % --- morie version stamp ----------------------------------------------------- -% This paper documents \pkg{morie} v0.6.1 (released 2026-05-13). Specific +% This paper documents \pkg{morie} v0.9.5 (released 2026-05-18). Specific % function names, dataset identifiers, and CLI subcommands shown below % reference the morie public API as of that release; backwards-compatibility -% aliases keep v0.x code references resolving across the v0.4.x series. +% aliases keep v0.x code references resolving across the v0.x series. % ----------------------------------------------------------------------------- @@ -142,7 +142,7 @@ \section{Introduction} estimator can be applied. The result is a literature in which methodologically careful single-source work coexists with multi-source descriptive work that under-uses available estimators -\citep{Doob2023, ZorroMedina2023JMP, SprottDoob2023}. +\citep{Doob2023, ZorroMedina2023JMP, SprottDoob2021}. This paper introduces the MRM framework, a multi-source statistical foundation that brings these data streams @@ -157,7 +157,7 @@ \section{Introduction} lineage and extends it to a Canadian carceral setting. The framework is implemented as a curated set of MRM modules in -MORIE\footnote{\textit{morie} is pronounced ``MOY-rye'', +\pkg{morie}\footnote{\textit{morie} is pronounced ``MOY-rye'', like the Greek \textit{Moirai}, the Fates.} \citep{Ruhela2026MorieR, Ruhela2026MoriePy}, a dual-language (\proglang{R} and \proglang{Python}) toolkit. The \proglang{R} @@ -180,7 +180,7 @@ \section{Introduction} at both the federal and provincial level. The aggregate family is exercised empirically to reproduce every published $\chi^{2}$ statistic from the Sprott--Doob--Iftene record to within $0.01$ -\citep{SprottDoob2023, SprottDoobIftene2021}. Third, the framework is +\citep{SprottDoob2021, SprottDoobIftene2021}. Third, the framework is extended into the spatio-temporal domain through a self-exciting Hawkes process specification (Section~\ref{sec:hawkes}) applied to Toronto Police Service open data. @@ -190,7 +190,7 @@ \section{Introduction} ``torture''-classified proportion rises from $12.5\%$ in fiscal 2023 to $20.6\%$ in fiscal 2025, exceeding by $10.7$ percentage points at peak the federal $\SIU$ rate of $9.9\%$ that -\citet{SprottDoob2023} report for fiscal 2019--2020. Under the +\citet{SprottDoob2021} report for fiscal 2019--2020. Under the broader restrictive-confinement classification advanced in Section~\ref{sec:mandela}, the 2025 rate reaches $40.9\%$. @@ -205,7 +205,7 @@ \section{Introduction} which Section~\ref{sec:mechanism} maps the deterrence / routine-activities / certainty mechanism categorisation onto the now-defined estimator family. -Section~\ref{sec:morie} summarises the implementation in MORIE, +Section~\ref{sec:morie} summarises the implementation in \pkg{morie}, and Section~\ref{sec:worked-example} walks through a complete reproducible analysis end-to-end. Section~\ref{sec:conclusion} concludes. @@ -220,7 +220,7 @@ \subsection{Notation} points (fiscal quarters for OTIS, calendar days for TPS, fiscal years for the aggregate sources). Each unit $i$ at time $t$ carries covariates $X_{it} \in \Rset^{p}$, a binary treatment -indicator $D_{it} \in \{0,1\}$, and an outcome $Y_{it} \in \R$. +indicator $D_{it} \in \{0,1\}$, and an outcome $Y_{it} \in \Rset$. The treatment indicator denotes the presence of the analytical treatment under study, such as a Mandela-classified placement, a $\SIU$ Section~34 trigger, or a TPS major-crime category. @@ -249,6 +249,7 @@ \subsection{The five sources} \caption{The five Canadian carceral, police, and oversight data sources integrated by MRM.} \label{tab:sources} +\resizebox{\linewidth}{!}{% \begin{tabular}{lllll} \toprule Source & Producer & Unit & Period & Access \\ @@ -259,7 +260,8 @@ \subsection{The five sources} $\TPS$ & Toronto Police & event--day & 2014--present & open \\ $\CCRSO$ & Public Safety CA & annual aggregate & 2007--present & open \\ \bottomrule -\end{tabular} +\end{tabular}% +} \end{table} \OTIS{} A01-RCDD is the Ontario Ministry of the Solicitor General (formerly the @@ -272,7 +274,7 @@ \subsection{The five sources} $\SIU$~$\IAP$ comprises the four quarterly reports of the federal Structured Intervention Unit Implementation Advisory Panel (2019--2022) and the parallel Sprott--Doob--Iftene academic -replication corpus \citep{SprottDoob2023, SprottDoobIftene2021}, which +replication corpus \citep{SprottDoob2021, SprottDoobIftene2021}, which includes contingency tables that report the federal Mandela rate of $9.9\%$ used as the reference point in Section~\ref{sec:mandela}. @@ -562,7 +564,7 @@ \section{Mandela Rules classifier and the headline finding} \subsection{Federal operationalisation} At the federal level, the Sprott--Doob--Iftene record -\citep{SprottDoob2023} classifies a Section~34 Structured +\citep{SprottDoob2021} classifies a Section~34 Structured Intervention Unit placement as Mandela-torture when the recorded placement duration exceeds fifteen days and one or more of the ``meaningful contact'' criteria is unmet. Their published rate for @@ -587,7 +589,7 @@ \subsection{Cross-jurisdiction comparison: an apples-to-apples \label{sec:mandela-comparison} Direct comparison of the federal $9.9\%$ -\citep{SprottDoob2023} with a single provincial rate is +\citep{SprottDoob2021} with a single provincial rate is methodologically delicate for two reasons: (i) the federal rate is computed under both Rule 43 (duration) and Rule 44 (meaningful contact), whereas $\OTIS{}$ public release does not expose @@ -626,10 +628,11 @@ \subsection{Cross-jurisdiction comparison: an apples-to-apples includes only placements with any alert (lower bound on a strict Rule 43 $\cap$ Rule 44), $C$ includes only placements without any alert (upper bound on the same). The federal cell -shows the Sprott--Doob--Iftene 2021/2023 published rates; an +shows the Sprott--Doob--Iftene 2021 published rates; an exact federal duration-only sub-rate is not separately published in that series, so we bound it below by the published -$10\%$ torture rate (Rule 43 $\cap$ Rule 44) and report the +$9.9\%$ Rule 43 $\cap$ Rule 44 torture rate (a strict subset of +Rule 43, hence a valid lower bound on the Rule 43-only rate) and report the $28\%$ contact-only rate (Rule 44 alone) as the analogue of our broader-RC column.} \label{tab:mandela} @@ -639,7 +642,7 @@ \subsection{Cross-jurisdiction comparison: an apples-to-apples & Rule 43 only & Rule 43 $\cap$ & Rule 43 $\cap$ & RC & \\ Jurisdiction (period) & ($A$) & alerted ($B$) & no-alert ($C$) & broader & Source \\ \midrule -Federal $\SIU$ 2019--2020 & $\geq 10\%$ & --- & --- & $\approx 28\%$ & $9.9\%$ \citep{SprottDoob2023} \\ +Federal $\SIU$ 2019--2020 & $\geq 9.9\%$ & --- & --- & $\approx 28\%$ & \citep{SprottDoob2021} \\ Provincial Ontario 2023 & $12.5\%$ & $0.072\%$ & $0.003\%$ & $31.5\%$ & MRM \\ Provincial Ontario 2024 & $16.5\%$ & $0.463\%$ & $0.194\%$ & $36.0\%$ & MRM \\ Provincial Ontario 2025 & $20.6\%$ & $0.905\%$ & $0.387\%$ & $40.9\%$ & MRM \\ @@ -659,14 +662,15 @@ \subsection{Cross-jurisdiction comparison: an apples-to-apples imposes only the duration condition). Pending a c11-level linkage to b01 alert flags (queued for the next release of \code{mrm\_classify\_mandela()}), we report this spectrum rather -than a single number. On the federal side, the Sprott--Doob 2023 +than a single number. On the federal side, the Sprott--Doob 2021 single headline of $9.9\%$ can be bracketed against the underlying Sprott--Doob--Iftene 2021 cohort ($N=1{,}979$): the federal -Rule 43-only sub-rate is bounded below by $10\%$ (since the -published $9.9\%$ Rule 43 $\cap$ Rule 44 torture rate is a strict -subset of Rule 43), and the federal Rule 44-only sub-rate is +Rule 43-only sub-rate is bounded below by $9.9\%$ (the published +$9.9\%$ Rule 43 $\cap$ Rule 44 torture rate counts a strict subset +of Rule 43 placements, so the Rule 43-only rate can only be +larger), and the federal Rule 44-only sub-rate is $28\%$. The provincial 2025 Rule 43 cell ($20.6\%$) sits above -the federal Rule 43 lower bound by approximately $+10.6$ +the federal Rule 43 lower bound by approximately $+10.7$ percentage points. The substantive implication for the \emph{Jahn}-settlement oversight framework is that the published duration-only metrics under-state the structural problem on the @@ -762,8 +766,8 @@ \subsection{Implementation} branching-ratio diagnostic for stationarity \citep{Ruhela2026Hawkes}. Since \pkg{morie}~v0.9.1 the likelihood evaluations run on a shared compiled C++ core used identically by the \proglang{Python} and -\proglang{R} sides, so the two language implementations cannot -diverge; the non-Markovian kernels are evaluated either by exact +\proglang{R} sides, so the two language implementations evaluate an +identical likelihood; the non-Markovian kernels are evaluated either by exact sub-quadratic truncation of the excitation sum or by a sum-of-exponentials engine, rather than the dense $O(n^2)$ sum. Goodness-of-fit is assessed through the time-rescaling theorem of @@ -813,11 +817,11 @@ \section{Mechanism categorisation} into treatment, the matched comparison estimates the effect under a common conditional probability of treatment given covariates. -\section{Implementation in MORIE} +\section{Implementation in morie} \label{sec:morie} The framework is implemented as a curated set of MRM modules in -MORIE, a dual-language (\proglang{R} and \proglang{Python}) +\pkg{morie}, a dual-language (\proglang{R} and \proglang{Python}) toolkit. The \proglang{R} and \proglang{Python} surfaces are both released under \code{AGPL-3.0-or-later}. The companion software papers @@ -850,12 +854,12 @@ \section{Worked example: reproducing the headline from public data} representative output blocks. Both the \proglang{R} and \proglang{Python} surfaces are shown side by side; users may adopt whichever they prefer. The example uses functions -exported by MORIE v0.6.1 +exported by morie v0.9.5 \citep{Ruhela2026MorieR, Ruhela2026MoriePy}. \subsection{Loading the data} -In \proglang{R}, OTIS A01-RCDD is reached either through MORIE's +In \proglang{R}, OTIS A01-RCDD is reached either through \pkg{morie}'s dataset catalog (which resolves to the maintained CKAN endpoint) or from a local CSV. Either path returns the placement-record data frame whose columns @@ -1046,7 +1050,7 @@ \subsection{Per-individual ATE estimation} (Eq.~\ref{eq:chi-sq} formulation in Section~\ref{sec:aggregate}). For example, to replicate the SIU placement $\times$ Indigenous-status cross-tabulation -reported in \citet{SprottDoob2023}, one loads the published +reported in \citet{SprottDoob2021}, one loads the published table from the package's data directory and runs: \begin{CodeChunk} @@ -1077,7 +1081,7 @@ \subsection{Hawkes self-excitation on TPS events} The Hawkes specifications of Section~\ref{sec:hawkes} are fit offline against the maintained Hawkes-refit manifest distributed -with MORIE. Fitted-parameter retrieval is exposed as +with \pkg{morie}. Fitted-parameter retrieval is exposed as \code{mrm\_tps\_load\_hawkes\_refit()}, which returns the per- neighbourhood power-law and exponential-kernel fits with their likelihoods, AIC/BIC scores, and time-rescaling-test statistics: @@ -1085,7 +1089,7 @@ \subsection{Hawkes self-excitation on TPS events} \begin{CodeChunk} \begin{CodeInput} R> refit <- mrm_tps_load_hawkes_refit( -+ system.file("extdata", "tps_hawkes_refit_v0.6.1.json", ++ system.file("extdata", "tps_hawkes_refit.json", + package = "morie") + ) R> subset(refit$summary, @@ -1142,7 +1146,7 @@ \subsection{Reproducibility} \section{Group-disparity auditing} \label{sec:fairness} -MORIE's accountability mission extends naturally from estimating +\pkg{morie}'s accountability mission extends naturally from estimating intervention effects to \emph{auditing} the algorithmic systems that allocate carceral and police attention. Version~0.8.0 adds a disparity-audit subsystem that measures whether a risk-assessment, @@ -1154,7 +1158,7 @@ \subsection{Disparity metrics} For a binary system decision $\hat{y}\in\{0,1\}$ and a protected attribute $g$, write $r_g = P(\hat{y}=1\mid g)$ for the favourable-outcome rate of group $g$ and let $p$ denote the privileged -reference group. MORIE computes the Disparate Impact Ratio +reference group. \pkg{morie} computes the Disparate Impact Ratio $\text{DIR} = r_g / r_p$ (the EEOC four-fifths rule:\ $\text{DIR}<0.8$ flags adverse impact) and the Demographic Parity Gap $\Delta_{\text{parity}} = r_g - r_p$. When ground truth is available, @@ -1169,7 +1173,7 @@ \subsection{Disparity metrics} \subsection{Predictive-policing calibration audit} Generalising the Strategic Subjects List analysis of -\citet{Lacherade2021PredPolChicago}, MORIE ranks geographic areas by +\citet{Lacherade2021PredPolChicago}, \pkg{morie} ranks geographic areas by the risk an algorithm \emph{predicts} and by their \emph{realised} outcome rate, then tests whether the rank disagreement tracks the areas' demographic composition. An area ranked far more dangerous @@ -1185,7 +1189,7 @@ \subsection{Predictive-policing calibration audit} \subsection{Explainability} -For model-level audits, MORIE reimplements the explainer suite of the +For model-level audits, \pkg{morie} reimplements the explainer suite of the COMPAS case study in \citet{Biecek2021XAIStories}: permutation importance, partial dependence, accumulated local effects, ceteris paribus, and Shapley attributions. These surface \emph{which} @@ -1231,7 +1235,7 @@ \section*{Acknowledgements} Sociolegal Studies, University of Toronto, for the methodological lineage adopted by the framework's identification strategy. The author thanks Jeroen Ooms for maintaining the r-universe -infrastructure on which the MORIE \proglang{R} binaries are built, +infrastructure on which the \pkg{morie} \proglang{R} binaries are built, and the DoubleML team (Philipp Bach, Malte~S.\ Kurz, Victor Chernozhukov, Martin Spindler, Sven Klaassen) for the BSD-3-Clause double-machine-learning library on which the framework's IRM and diff --git a/papers/mrm-formulations-paper/refs.bib b/papers/mrm-formulations-paper/refs.bib index 4ecb295d6a..3cc0916b37 100644 --- a/papers/mrm-formulations-paper/refs.bib +++ b/papers/mrm-formulations-paper/refs.bib @@ -20,20 +20,6 @@ @techreport{SprottDoob2021 note = {The headline figure of $9.9\%$ of SIU person-stays meeting both Rule~43 (duration) and Rule~44 (lack of meaningful contact) is published in this report.} } -% SprottDoob2023 is an alias for SprottDoob2021 kept here for backward-compat with -% existing \citet{SprottDoob2023} calls in the manuscripts. The headline 9.9% figure -% is from the February 2021 report above; the year-2023 key reflects the date our -% papers were drafted, not the report's publication year. -@techreport{SprottDoob2023, - author = {Sprott, Jane B. and Doob, Anthony N.}, - title = {Solitary Confinement, Torture, and {C}anada's Structured Intervention Units}, - institution = {Centre for Criminology and Sociolegal Studies, University of Toronto}, - year = {2021}, - month = feb, - type = {Report}, - url = {https://www.crimsl.utoronto.ca/sites/www.crimsl.utoronto.ca/files/TortureSolitarySIUsSprottDoob23Feb2021_0.pdf}, - note = {Verified URL 2026-05-14; the 9.9\% federal Mandela ``torture'' figure is published here.} -} @techreport{DoobSprott2020, author = {Doob, Anthony N. and Sprott, Jane B.}, @@ -277,7 +263,6 @@ @article{Hawkes1971 doi = {10.1093/biomet/58.1.83} } -doi = {10.1109/TIT.1981.1056305} @article{Ogata1981, author = {Ogata, Yosihiko}, title = {On {L}ewis' simulation method for point processes}, @@ -285,7 +270,8 @@ @article{Ogata1981 year = {1981}, volume = {27}, number = {1}, - pages = {23--31} + pages = {23--31}, + doi = {10.1109/TIT.1981.1056305} } @book{Daley2003, @@ -364,7 +350,7 @@ @misc{Ruhela2026MorieR title = {morie: Multi-domain Open Research and Inferential Estimation in {R}}, year = {2026}, doi = {10.5281/zenodo.20111233}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026MoriePy, @@ -372,7 +358,7 @@ @misc{Ruhela2026MoriePy title = {morie: Multi-domain Open Research and Inferential Estimation in {Python}}, year = {2026}, doi = {10.5281/zenodo.20096350}, - version = {0.9.4} + version = {0.9.5} } @misc{Ruhela2026Hawkes, diff --git a/pyproject.toml b/pyproject.toml index 950df0b57c..42c354391c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "morie" -version = "0.9.4" +version = "0.9.5.2" description = "Multi-domain Open Research and Inferential Estimation. Multi-domain scientific computing toolkit hosting the MRM framework for Canadian carceral, police, and oversight data, with general-purpose causal inference, signal processing, cryptography, spatial statistics, statistical physics, and psychometrics modules. (Renamed from MOIRAIS in v0.1.3.)" authors = [ { name="Vansh Singh Ruhela", email="hadesllm@proton.me" } diff --git a/r-package/morie/.Rbuildignore b/r-package/morie/.Rbuildignore index 166c0b12de..13a4721f24 100644 --- a/r-package/morie/.Rbuildignore +++ b/r-package/morie/.Rbuildignore @@ -16,3 +16,9 @@ ^\.RData\$ ^PI_HANDOFF\.md\$ ^src/\.clangd$ +^src/Makevars$ +^src/Makevars\.win$ +^\.lintr$ +^scripts$ +^_pkgdown\.yml$ +^docs$ diff --git a/r-package/morie/.github/CONTRIBUTING.md b/r-package/morie/.github/CONTRIBUTING.md new file mode 100644 index 0000000000..9e24bc3bff --- /dev/null +++ b/r-package/morie/.github/CONTRIBUTING.md @@ -0,0 +1,89 @@ +# Contributing to morie + +Thanks for your interest in contributing! morie is a dual-language +(Python + R) scientific computing toolkit; contributions of all sizes +are welcome. + +This document follows the Rust-ecosystem contribution model: short, +permissive, and standard GitHub PR workflow. + +## License of contributions + +By submitting a contribution you agree that it is licensed under +**AGPL-3.0-or-later** — the same license that covers both the Python +and R packages. + +Contributions to the `kernel-module/` and `daemon/` subtrees are +licensed under **GPL-2.0-only** (Linux kernel ABI requirement); flag +in your PR if you are touching those subtrees. + +No CLA, no DCO sign-off required. Just opening the PR is the +agreement. + +## How to contribute + +1. **Open an issue first** if your change is non-trivial — it saves + re-work later. For tiny fixes (typos, doc improvements, obvious + bugs) feel free to skip straight to a PR. +2. **Fork + branch + PR.** Standard GitHub flow. +3. **Test locally** before opening the PR: + - Python: `pytest -q tests/` + - R: `R CMD check --as-cran r-package/morie` + - If you touched the LaTeX papers under `papers/`, all five rebuild + with `xelatex main.tex && bibtex main && xelatex main.tex && + xelatex main.tex`. +4. **Update docs** if your change is user-visible: + - Python API change: docstring + `tests/`. + - R API change: roxygen2 docstring + regenerate `.Rd` with + `devtools::document()` + commit the regenerated files. + - User-facing behaviour: `r-package/morie/NEWS.md` and/or + `ROADMAP.md`. + +## Commit messages + +Keep them readable. Imperative mood ("add X", "fix Y", "refactor Z"). +Wrap the body at ~72 chars. Explain *why* the change is needed when +the diff itself doesn't make it obvious. + +If your PR is logically more than one change, split it into multiple +commits — each one should compile and pass tests on its own. + +## Coding style + +- **Python**: PEP 8 with `ruff format` (black-compatible). Type hints + on public-API functions. Numpydoc-style docstrings. +- **R**: 2-space indentation, snake_case, roxygen2 docstrings on every + exported function. Every R file under `r-package/morie/R/` carries + an SPDX header `# SPDX-License-Identifier: AGPL-3.0-or-later`. +- **LaTeX (papers)**: JSS document class; long filenames in + `\footnote{...}` so the prose can break cleanly. + +## Code of conduct + +Please be respectful. We follow the +[Contributor Covenant v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct/); +the full text is in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md). +Reports go to . + +## Security disclosures + +**Do not file security issues in the public tracker.** +See [`.github/SECURITY.md`](.github/SECURITY.md) for the disclosure +address and embargo policy. + +## Maintainer + +Vansh Singh Ruhela +([@rootcoder007](https://github.com/rootcoder007), +, +ORCID [0009-0004-1750-3592](https://orcid.org/0009-0004-1750-3592)). + +## A note on the AGPL-3.0 license + +morie is licensed under the GNU Affero General Public License, version +3.0 or later. It is a strong copyleft license: anyone who distributes +a modified morie — or offers a modified morie to users over a network +— must publish their source under the same license. This keeps every +improvement to morie visible to the public and prevents closed-source +forks. See [`LICENSING.md`](LICENSING.md) for the per-component +breakdown, including the optional `GPL-2.0-only` kernel adjuncts. diff --git a/r-package/morie/.github/workflows/r-cmd-check.yml b/r-package/morie/.github/workflows/r-cmd-check.yml new file mode 100644 index 0000000000..57647129a2 --- /dev/null +++ b/r-package/morie/.github/workflows/r-cmd-check.yml @@ -0,0 +1,22 @@ +name: R-CMD-check (subdir marker) + +# This workflow exists so rOpenSci's pkgcheck detects CI inside the +# package sub-directory ("r-package/morie/"). The actual matrix R CMD +# check across (ubuntu, macos, windows) x (release, devel, oldrel) is +# configured at the repository root in `.github/workflows/r-cmd-check.yml` +# and runs on every push/PR. This file is dispatch-only so it doesn't +# duplicate that run; pkgcheck only needs the file to exist. + +on: + workflow_dispatch: + +permissions: + contents: read + +jobs: + marker: + runs-on: ubuntu-latest + steps: + - name: Note + run: | + echo "Matrix R-CMD-check is at ../../.github/workflows/r-cmd-check.yml" diff --git a/r-package/morie/.gitignore b/r-package/morie/.gitignore new file mode 100644 index 0000000000..d7514e8715 --- /dev/null +++ b/r-package/morie/.gitignore @@ -0,0 +1,13 @@ +# Build artifacts -- regenerated by `R CMD build` / `R CMD INSTALL`. +inst/doc/ +src/*.o +src/*.so +src/Makevars +src/symbols.rds +/morie.Rcheck/ +/morie_*.tar.gz +..Rcheck/ +src/*.gcda + +src/*.gcno +src/*.gcov diff --git a/r-package/morie/.lintr b/r-package/morie/.lintr new file mode 100644 index 0000000000..53a88579dd --- /dev/null +++ b/r-package/morie/.lintr @@ -0,0 +1,28 @@ +linters: linters_with_defaults( + line_length_linter = line_length_linter(120L), + object_name_linter = NULL, + commented_code_linter = NULL, + object_length_linter = object_length_linter(length = 50L), + indentation_linter = NULL, + object_usage_linter = NULL, + brace_linter = NULL, + commas_linter = NULL + ) +exclusions: list( + "data-raw" = Inf, + "tests/testthat" = list( + undesirable_function_linter = Inf, + library_call_linter = Inf + ), + "vignettes" = list( + library_call_linter = Inf, + line_length_linter = Inf + ), + "R/RcppExports.R" = Inf, + "R/dataset_catalog.R" = list(line_length_linter = Inf), + "R/study_core.R" = list(line_length_linter = Inf), + "R/study_reporting.R" = list(line_length_linter = Inf), + "R/rndsr.R" = list(line_length_linter = Inf), + "R/rfens.R" = list(line_length_linter = Inf) + ) +encoding: "UTF-8" diff --git a/r-package/morie/DESCRIPTION b/r-package/morie/DESCRIPTION index 1060495b71..0d1767fcbd 100644 --- a/r-package/morie/DESCRIPTION +++ b/r-package/morie/DESCRIPTION @@ -1,7 +1,7 @@ Package: morie Type: Package Title: Multi-Domain Open Research and Inferential Estimation -Version: 0.9.4 +Version: 0.9.5.2 Authors@R: c( person( given = "Vansh Singh", @@ -43,6 +43,7 @@ Imports: utils, Rcpp LinkingTo: Rcpp +SystemRequirements: libcurl: libcurl-devel (rpm) or libcurl4-openssl-dev (deb) Suggests: digest, openssl, @@ -51,7 +52,12 @@ Suggests: MatchIt, DBI, RSQLite, + duckdb, + RPostgres, jsonlite, + httr2, + xml2, + rvest, data.table, readxl, pracma, @@ -65,6 +71,7 @@ Suggests: caret, e1071, glmnet, + elasticnet, pROC, randomForest, rpart, @@ -88,6 +95,7 @@ Suggests: eegkit, R.matlab, testthat (>= 3.0.0), + withr, knitr, rmarkdown VignetteBuilder: knitr @@ -103,7 +111,6 @@ Collate: 'aaa_helpers_horowitz.R' 'aaa_helpers_llm_arch.R' 'aaa_helpers_montesinos.R' - 'aaa_helpers_samples.R' 'aaa_helpers_time_series_advanced.R' 'accgp.R' 'agset.R' @@ -143,6 +150,7 @@ Collate: 'ctmed.R' 'ctrlc.R' 'data.R' + 'data_access.R' 'database.R' 'dataset_catalog.R' 'dataset_profile.R' @@ -314,7 +322,6 @@ Collate: 'okrig.R' 'optcl.R' 'ordct.R' - 'ordlt.R' 'ordlt_jonckheere.R' 'paths.R' 'pcadm.R' @@ -374,6 +381,7 @@ Collate: 'sglm.R' 'sgnpw.R' 'signal.R' + 'siu.R' 'smixd.R' 'sobls.R' 'spblk.R' @@ -425,3 +433,4 @@ Collate: 'wsrpw.R' 'xavir.R' 'xgbst.R' + 'zzz_x_helpers.R' diff --git a/r-package/morie/NAMESPACE b/r-package/morie/NAMESPACE index 4330e4a047..d30a7b8819 100644 --- a/r-package/morie/NAMESPACE +++ b/r-package/morie/NAMESPACE @@ -1,86 +1,19 @@ -# Generated by combined roxygen pass + regex sweep (v0.3.0 unify). -useDynLib(morie, .registration = TRUE) -importFrom(Rcpp, sourceCpp) -S3method(print, morie_hawkes_fit) -export(morie_fast_available) -export(morie_hawkes_fit) +# Generated by roxygen2: do not edit by hand + +S3method(print,morie_hawkes_fit) export(agset) export(algnm) -export(aniso) -export(anova_one_way) -export(arch_in_mean) -export(ask_percy) -export(attnq_scaled_dot_product_attention) -export(audit_public_outputs) -export(bayes_cpi_genomic) -export(bayes_ridge_gibbs) -export(bayesian_lasso_full) -export(bayesian_ridge_regression) -export(bkprp_backpropagation) -export(bnfwd_batch_norm_forward) -export(bootstrap_sample) export(brdgr) -export(build_outputs_manifest) -export(build_prompt) export(buttbp) export(buttbs) export(butthp) export(buttlp) export(bysid) -export(calculate_ebac) -export(calculate_ipw_weights) -export(calibration_weights) -export(canonicalize_cpads_data) -export(chi_square_test) -export(cluster_sample) export(cndrc) -export(cnn1d_conv1d_forward) -export(cnn2d_conv2d_forward) -export(cnn_genomic) -export(cohens_d) -export(coherence) export(cokrg) -export(compare_nested_logistic_models) -export(compute_design_weights) -export(concordance_incomplete) -export(confusion_matrix_metrics) -export(contingency_coefficient) -export(control_comparison) -export(control_median_test) -export(cpads_contract) -export(cramers_v) export(csphr) -export(dbscan_clustering) -export(dcc_multivariate_garch) -export(decision_tree_split) -export(deep_learning_genomic) -export(default_synthetic_name_map) -export(default_workflow_map) -export(design_effect) -export(diffu_diffusion_forward) -export(diffu_heat_diffusion) export(dimrd) -export(drpfw_dropout_forward) export(dwnmn) -export(e_value) -export(effective_sample_size) -export(eg_coint) -export(egarch_model) -export(estimate_aipw) -export(estimate_atc) -export(estimate_ate) -export(estimate_att) -export(estimate_cate) -export(estimate_g_computation) -export(estimate_gate) -export(estimate_irm) -export(estimate_late) -export(estimate_propensity_scores) -export(eta_squared) -export(ewma_volatility) -export(find_project_root) -export(fisher_exact_test) -export(fwpas_forward_pass_dense) export(fzbrd) export(fzcvm) export(fzedg) @@ -96,118 +29,455 @@ export(fzqnt) export(fzsgn) export(fzsrv) export(fzwlc) -export(ganls_gan_loss) -export(garch_fit) -export(gblup_full) -export(generate_synthetic_data) -export(genomic_cross_validation) -export(ghosal_adaptation) -export(ghosal_bernstein_von_mises) -export(ghosal_contraction_rate) -export(ghosal_dirichlet_posterior) -export(ghosal_dpmixture_density) -export(ghosal_empirical_bayes) -export(ghosal_gp_matern) -export(ghosal_gp_squared_exponential) -export(ghosal_hierarchical_bayes) -export(ghosal_log_density) -export(ghosal_moment_matching) -export(ghosal_neutral_right) -export(ghosal_np_classification) -export(ghosal_np_regression) -export(ghosal_np_testing) -export(ghosal_posterior_consistency) -export(ghosal_sieve_prior) -export(ghosal_stick_breaking_trunc) -export(ghosal_survival_beta_process) -export(ghosal_wavelet_prior) -export(gradient_boosting_ensemble) -export(gradient_boosting_genomic) -export(gradient_descent_vanilla) -export(grid_search_cv) -export(grm_vanraden) -export(grucl_gru_cell) export(gwreg) -export(gxe_interaction_model) -export(hedges_g) -export(heinz_he_initialization) export(hfd) -export(hurst_r) export(idlpt) export(indkr) -export(infer_measurement_level) -export(inspect_output) export(irtsp) -export(is_over_legal_limit) -export(jackknife_estimate) -export(johansen_cointegration) -export(kalman_filter) -export(kendall_tau) -export(kendall_tau_partial) -export(kmeans_clustering) -export(kruskal_wallis_test) -export(ksr01_kosorok_empirical_process) -export(ksr02_kosorok_donsker_class) -export(ksr03_kosorok_glivenko_cantelli) -export(ksr04_kosorok_vc_dimension) -export(ksr05_kosorok_bracketing_number) -export(ksr06_kosorok_maximal_inequality) -export(ksr07_kosorok_bootstrap_empirical) -export(ksr08_kosorok_multiplier_bootstrap) -export(ksr09_kosorok_z_estimator) -export(ksr10_kosorok_m_estimator) -export(ksr11_kosorok_efficient_score) -export(ksr12_kosorok_information_bound) -export(ksr13_kosorok_tangent_space) -export(ksr14_kosorok_profile_likelihood) -export(ksr15_kosorok_one_step_estimator) -export(ksr16_kosorok_influence_function) -export(ksr17_kosorok_counting_process) -export(ksr18_kosorok_nelson_aalen) -export(ksr19_kosorok_cox_partial_likelihood) -export(ksr20_kosorok_censoring_survival) -export(learning_curve) -export(levene_test) -export(linear_regression_ols) -export(list_morie_modules) -export(load_cpads_data) -export(lstmc_lstm_cell) -export(mann_whitney_test) -export(marker_variance) export(mdspl) export(mdvtr) -export(mhatf_multi_head_attention_full) -export(midas_regression) export(midranks) -export(mini_batch_gradient) export(mnpbt) +export(morie_agenda_setter_power) +export(morie_aniso) +export(morie_anisotropy_test) +export(morie_anova_one_way) +export(morie_antithetic_variates) +export(morie_arch_in_mean) +export(morie_ask_percy) +export(morie_attnq_scaled_dot_product_attention) +export(morie_audit_public_outputs) +export(morie_backpropagation) +export(morie_batch_norm_forward) +export(morie_bayes_cpi_genomic) +export(morie_bayes_ridge_gibbs) +export(morie_bayesian_ideal_points) +export(morie_bayesian_lasso_full) +export(morie_bayesian_ridge_regression) +export(morie_bkprp_backpropagation) +export(morie_bnfwd_batch_norm_forward) +export(morie_bootstrap_ci) +export(morie_bootstrap_sample) +export(morie_bridge_observations) +export(morie_build_outputs_manifest) +export(morie_build_prompt) export(morie_builtin_db) +export(morie_cache_clear) +export(morie_cache_dir) export(morie_cache_file) export(morie_cache_list) export(morie_cache_load) export(morie_cache_store) +export(morie_calculate_ebac) +export(morie_calculate_ipw_weights) +export(morie_calibration_weights) +export(morie_canonicalize_cpads_data) export(morie_check_plugin_license) +export(morie_chi_square_test) +export(morie_ckan_search) +export(morie_cluster_sample) +export(morie_cnn1d_conv1d_forward) +export(morie_cnn2d_conv2d_forward) +export(morie_cnn_genomic) +export(morie_cohens_d) +export(morie_coherence) +export(morie_cokriging) +export(morie_compare_nested_logistic_models) +export(morie_compute_design_weights) +export(morie_concordance_incomplete) +export(morie_condorcet_winner) +export(morie_confusion_matrix_metrics) +export(morie_contingency_coefficient) +export(morie_control_comparison) +export(morie_control_median_test) +export(morie_control_variates) +export(morie_conv1d_forward) +export(morie_conv2d_forward) +export(morie_copula_estimation) +export(morie_cpads_contract) +export(morie_cramers_v) +export(morie_cutting_plane_sphere) export(morie_dataset_catalog) export(morie_dataset_info) export(morie_db_connect) +export(morie_dbscan_clustering) +export(morie_dcc_multivariate_garch) +export(morie_decision_tree_split) +export(morie_deep_learning_genomic) +export(morie_default_synthetic_name_map) +export(morie_default_workflow_map) +export(morie_design_effect) +export(morie_det_rng) +export(morie_det_rng_sha_hex) +export(morie_diffu_diffusion_forward) +export(morie_diffu_heat_diffusion) +export(morie_diffusion_forward) +export(morie_dimensionality_test) export(morie_download_bootstrap) +export(morie_dropout_forward) +export(morie_drpfw_dropout_forward) +export(morie_dynamic_wnominate) +export(morie_e_value) +export(morie_effective_sample_size) +export(morie_eg_coint) +export(morie_egarch_model) +export(morie_estimate_aipw) +export(morie_estimate_atc) +export(morie_estimate_ate) +export(morie_estimate_att) +export(morie_estimate_cate) +export(morie_estimate_g_computation) +export(morie_estimate_gate) +export(morie_estimate_irm) +export(morie_estimate_late) +export(morie_estimate_propensity_scores) +export(morie_eta_squared) +export(morie_ewma_volatility) +export(morie_extreme_value_gev) +export(morie_fairness_average_odds_difference) +export(morie_fairness_bias_amplification) +export(morie_fairness_demographic_parity) +export(morie_fairness_disparate_impact) +export(morie_fairness_equalized_odds) +export(morie_fairness_gini) +export(morie_fast_available) +export(morie_fauzi_bias_reduced_kdfe) +export(morie_fauzi_cvm_smoothed) +export(morie_fauzi_edgeworth_quantile) +export(morie_fauzi_h_decomposition) +export(morie_fauzi_higher_order_kernel) +export(morie_fauzi_kdfe_properties) +export(morie_fauzi_kernel_quantile_asymptotic) +export(morie_fauzi_ks_smoothed) +export(morie_fauzi_l_statistic) +export(morie_fauzi_mise_computation) +export(morie_fauzi_mrl_asymptotic) +export(morie_fauzi_mrl_boundary_free) +export(morie_fauzi_smoothed_sign) +export(morie_fauzi_smoothed_wilcoxon) +export(morie_fauzi_survival_kernel) +export(morie_fetch) +export(morie_fetch_arcgis) export(morie_fetch_ckan) export(morie_fetch_siu) export(morie_fetch_tps) +export(morie_find_project_root) +export(morie_fisher_exact_test) +export(morie_forward_pass_dense) +export(morie_fwpas_forward_pass_dense) +export(morie_gan_loss) +export(morie_ganls_gan_loss) +export(morie_garch_fit) +export(morie_gblup_full) +export(morie_generalized_pareto) export(morie_generate_ar_coefficients) +export(morie_generate_synthetic_data) export(morie_generate_var_coefficients) +export(morie_genomic_cross_validation) +export(morie_geographically_weighted_regression) +export(morie_ghosal_adaptation) +export(morie_ghosal_bernstein_von_mises) +export(morie_ghosal_contraction_rate) +export(morie_ghosal_dirichlet_posterior) +export(morie_ghosal_dpmixture_density) +export(morie_ghosal_empirical_bayes) +export(morie_ghosal_gp_matern) +export(morie_ghosal_gp_squared_exponential) +export(morie_ghosal_hierarchical_bayes) +export(morie_ghosal_log_density) +export(morie_ghosal_moment_matching) +export(morie_ghosal_neutral_right) +export(morie_ghosal_np_classification) +export(morie_ghosal_np_regression) +export(morie_ghosal_np_testing) +export(morie_ghosal_posterior_consistency) +export(morie_ghosal_sieve_prior) +export(morie_ghosal_stick_breaking_trunc) +export(morie_ghosal_survival_beta_process) +export(morie_ghosal_wavelet_prior) export(morie_gpl_compatible_licenses) +export(morie_gradient_boosting_ensemble) +export(morie_gradient_boosting_genomic) +export(morie_gradient_descent_vanilla) +export(morie_grid_search_cv) +export(morie_grm_vanraden) +export(morie_gru_cell) +export(morie_grucl_gru_cell) +export(morie_gxe_interaction_model) +export(morie_hawkes_fit) +export(morie_he_initialization) +export(morie_hedges_g) +export(morie_heinz_he_initialization) +export(morie_horowitz_average_derivative) +export(morie_horowitz_bandwidth_bootstrap) +export(morie_horowitz_binary_response) +export(morie_horowitz_censored_regression) +export(morie_horowitz_deconvolution) +export(morie_horowitz_duration_model) +export(morie_horowitz_index_model) +export(morie_horowitz_kernel_density) +export(morie_horowitz_kernel_regression) +export(morie_horowitz_local_ate) +export(morie_horowitz_local_linear) +export(morie_horowitz_mixture_model) +export(morie_horowitz_nonparametric_iv) +export(morie_horowitz_plr_bandwidth) +export(morie_horowitz_plr_estimator) +export(morie_horowitz_quantile_regression) +export(morie_horowitz_sample_selection) +export(morie_horowitz_smoothed_maximum_score) +export(morie_horowitz_treatment_effect) +export(morie_horowitz_wild_bootstrap) +export(morie_hurst_r) +export(morie_ideal_point_model) +export(morie_ideal_point_recovery) +export(morie_importance_sampling) +export(morie_indicator_kriging) +export(morie_infer_measurement_level) +export(morie_inspect_output) +export(morie_irt_spatial) +export(morie_is_over_legal_limit) +export(morie_isotonic_regression) +export(morie_jackknife_estimate) +export(morie_jackknife_estimator) +export(morie_johansen_cointegration) +export(morie_kalman_filter) +export(morie_kendall_tau) +export(morie_kendall_tau_partial) +export(morie_kmeans_clustering) +export(morie_kosorok_bootstrap_empirical) +export(morie_kosorok_bracketing_number) +export(morie_kosorok_censoring_survival) +export(morie_kosorok_counting_process) +export(morie_kosorok_cox_partial_likelihood) +export(morie_kosorok_donsker_class) +export(morie_kosorok_efficient_score) +export(morie_kosorok_empirical_process) +export(morie_kosorok_glivenko_cantelli) +export(morie_kosorok_influence_function) +export(morie_kosorok_information_bound) +export(morie_kosorok_m_estimator) +export(morie_kosorok_maximal_inequality) +export(morie_kosorok_multiplier_bootstrap) +export(morie_kosorok_nelson_aalen) +export(morie_kosorok_one_step_estimator) +export(morie_kosorok_profile_likelihood) +export(morie_kosorok_tangent_space) +export(morie_kosorok_vc_dimension) +export(morie_kosorok_z_estimator) +export(morie_kruskal_wallis_test) +export(morie_ksr01_kosorok_empirical_process) +export(morie_ksr02_kosorok_donsker_class) +export(morie_ksr03_kosorok_glivenko_cantelli) +export(morie_ksr04_kosorok_vc_dimension) +export(morie_ksr05_kosorok_bracketing_number) +export(morie_ksr06_kosorok_maximal_inequality) +export(morie_ksr07_kosorok_bootstrap_empirical) +export(morie_ksr08_kosorok_multiplier_bootstrap) +export(morie_ksr09_kosorok_z_estimator) +export(morie_ksr10_kosorok_m_estimator) +export(morie_ksr11_kosorok_efficient_score) +export(morie_ksr12_kosorok_information_bound) +export(morie_ksr13_kosorok_tangent_space) +export(morie_ksr14_kosorok_profile_likelihood) +export(morie_ksr15_kosorok_one_step_estimator) +export(morie_ksr16_kosorok_influence_function) +export(morie_ksr17_kosorok_counting_process) +export(morie_ksr18_kosorok_nelson_aalen) +export(morie_ksr19_kosorok_cox_partial_likelihood) +export(morie_ksr20_kosorok_censoring_survival) +export(morie_latin_hypercube) +export(morie_learning_curve) +export(morie_levene_test) export(morie_license_metadata) +export(morie_linear_regression_ols) export(morie_list_datasets) +export(morie_list_morie_modules) export(morie_load_cpads) +export(morie_load_cpads_data) export(morie_load_dataset) +export(morie_lstm_cell) +export(morie_lstmc_lstm_cell) +export(morie_mann_whitney_test) +export(morie_marker_variance) +export(morie_maxpool_forward) +export(morie_mds_spatial_map) +export(morie_median_voter) +export(morie_mhatf_multi_head_attention_full) +export(morie_midas_regression) +export(morie_mini_batch_gradient) +export(morie_monte_carlo_integration) +export(morie_multi_head_attention_full) +export(morie_multi_trait_gblup) +export(morie_multinomial_probit_spatial) export(morie_mvn_with_covariance) +export(morie_mxpol_maxpool_forward) +export(morie_nbeats_basis) +export(morie_nonstationary_covariance) +export(morie_odds_ratio_ci) +export(morie_omega_squared) +export(morie_one_sample_coverage) +export(morie_one_sample_t_test) +export(morie_optimal_classification) +export(morie_ordered_alternatives_test) +export(morie_ordered_categories) +export(morie_ordinary_kriging) +export(morie_paired_t_test) +export(morie_party_alignment) export(morie_paths) +export(morie_pca_dimension_reduction) +export(morie_pcg_filter) +export(morie_penalized_regression) +export(morie_penalized_spline) +export(morie_percentile_modified_rank) +export(morie_permutation_test_general) +export(morie_point_biserial_r) +export(morie_polarization_index) +export(morie_polynomial_regression) +export(morie_posab_positional_encoding_abs) +export(morie_positional_encoding_abs) +export(morie_power_prop_test) +export(morie_power_t_test) +export(morie_pps_sample) +export(morie_prediction_accuracy) +export(morie_predpol_aggregate_areas) +export(morie_predpol_calibration_audit) +export(morie_predpol_score_disparity) +export(morie_predpol_temporal_audit) +export(morie_profile_dataset) +export(morie_prophet_components) +export(morie_proportion_ci) +export(morie_quantile_function) +export(morie_random_forest_ensemble) +export(morie_random_forest_genomic) +export(morie_random_search_cv) +export(morie_rangayyan_adaptive_filter) +export(morie_rangayyan_approximate_entropy) +export(morie_rangayyan_ar_burg) +export(morie_rangayyan_coherence) +export(morie_rangayyan_correlation_dimension) +export(morie_rangayyan_dfa) +export(morie_rangayyan_eeg_bands) +export(morie_rangayyan_emg_rms) +export(morie_rangayyan_envelope) +export(morie_rangayyan_fir_filter) +export(morie_rangayyan_higuchi_fd) +export(morie_rangayyan_hrv) +export(morie_rangayyan_iir_filter) +export(morie_rangayyan_lyapunov) +export(morie_rangayyan_psd) +export(morie_rangayyan_qrs_detect) +export(morie_rangayyan_sample_entropy) +export(morie_rangayyan_stft) +export(morie_rangayyan_wavelet_denoise) +export(morie_rangayyan_zero_crossing) +export(morie_rank_based_test) +export(morie_rank_order_statistics) +export(morie_rank_placements) +export(morie_read_outputs_manifest) +export(morie_regime_switching) +export(morie_regularization_path) +export(morie_residual_connection) +export(morie_return_level) +export(morie_risk_difference_ci) +export(morie_risk_ratio_ci) +export(morie_rkhs_full) +export(morie_rkhs_kernel_regression) +export(morie_rnn_genomic) +export(morie_roc_auc_score) +export(morie_roll_call_analysis) +export(morie_rotary_position_embedding) +export(morie_rotrp_rotary_position_embedding) +export(morie_rslnk_residual_connection) +export(morie_run_ebac_selection_ipw_analysis) +export(morie_run_morie_module) +export(morie_run_morie_modules) +export(morie_run_pipeline) +export(morie_run_propensity_ipw_analysis) +export(morie_run_treatment_effects_analysis) +export(morie_run_weighted_logistic_analysis) +export(morie_run_workflow_step) export(morie_sample) +export(morie_sample_size_logistic) +export(morie_scaled_dot_product_attention) +export(morie_sensitivity_rosenbaum) +export(morie_sgolay_smooth) +export(morie_shapiro_wilk_test) +export(morie_sign_test_power) +export(morie_simple_random_sample) export(morie_simulate_longitudinal_panel) +export(morie_siu_anomaly_check) +export(morie_siu_audit_case) +export(morie_siu_audit_columns) +export(morie_siu_compare) +export(morie_siu_index) +export(morie_siu_llm_extract) +export(morie_siu_record_correction) +export(morie_siu_refresh_manifest) +export(morie_siu_sanity_check) +export(morie_siu_translate) +export(morie_siu_translate_fr_to_en) +export(morie_sobol_sequence) +export(morie_spatial_agreement) +export(morie_spatial_ar_error) +export(morie_spatial_ar_lag) +export(morie_spatial_autocorrelation) +export(morie_spatial_block_kriging) +export(morie_spatial_cross_validation) +export(morie_spatial_glm) +export(morie_spatial_mixed_model) +export(morie_spatial_trend_surface) +export(morie_spatiotemporal_autocovariance) +export(morie_spatiotemporal_kriging) +export(morie_spatiotemporal_variogram) +export(morie_spearman_rho) +export(morie_spectral_density) +export(morie_state_space_model) +export(morie_stratified_sample) +export(morie_stratified_sampling) +export(morie_suggest_analysis_plan) +export(morie_sukhatme_test) +export(morie_summarize_output_audit) +export(morie_svm_genomic) +export(morie_svm_hinge_primal) +export(morie_svm_kernel_trick) export(morie_sync_rng) +export(morie_terry_hoeffding_test) +export(morie_tgarch_model) +export(morie_thin_plate_spline) +export(morie_threshold_autoregression) +export(morie_tolerance_limits) export(morie_tps_layer_urls) +export(morie_transformer_block) +export(morie_transformer_genomic) +export(morie_trfbl_transformer_block) +export(morie_tsne_reduction) +export(morie_two_sample_coverage) +export(morie_two_sample_t_test) +export(morie_unfolding_analysis) +export(morie_universal_kriging) +export(morie_unobserved_components) export(morie_userguide) +export(morie_vae_elbo) +export(morie_vaenc_vae_elbo) +export(morie_validate_cpads_data) +export(morie_validate_outputs_manifest) +export(morie_van_der_waerden_test) +export(morie_variogram_estimation) +export(morie_variogram_fitting) +export(morie_vecm) +export(morie_verify_statistical_output) +export(morie_vine_copula) +export(morie_voting_power_index) +export(morie_wavelet_time_series) +export(morie_wilcoxon_power) +export(morie_wilcoxon_signed_rank_test) +export(morie_wnominate) +export(morie_wnominate_estimate) +export(morie_write_synthetic_data) +export(morie_xavier_initialization) +export(morie_xavir_xavier_init) +export(morie_xgboost_objective) export(mrm_anova_bonferroni) export(mrm_anova_oneway) export(mrm_anova_power) @@ -249,44 +519,11 @@ export(mrm_tps_polygon_moran_per_year) export(mrm_two_treatment_test) export(mrm_twoprop_test) export(mrm_var_test) -export(multi_trait_gblup) -export(mxpol_maxpool_forward) -export(nbeats_basis) export(nstat) -export(odds_ratio_ci) export(okrig) -export(omega_squared) -export(one_sample_coverage) -export(one_sample_t_test) export(optcl) -export(ordered_alternatives_test) -export(ordered_categories) -export(paired_t_test) -export(pca_dimension_reduction) -export(pcg_filter) -export(penalized_regression) -export(percentile_modified_rank) -export(point_biserial_r) export(polrz) -export(polynomial_regression) -export(posab_positional_encoding_abs) -export(power_prop_test) -export(power_t_test) -export(pps_sample) -export(prediction_accuracy) -export(profile_dataset) -export(prophet_components) -export(proportion_ci) -export(random_forest_ensemble) -export(random_forest_genomic) -export(random_search_cv) -export(rank_based_test) -export(rank_order_statistics) -export(rank_placements) export(rcall) -export(read_outputs_manifest) -export(regime_switching) -export(regularization_path) export(rgadp) export(rgapn) export(rgarb) @@ -307,83 +544,31 @@ export(rgsam) export(rgstf) export(rgwav) export(rgzcr) -export(risk_difference_ci) -export(risk_ratio_ci) -export(rkhs_full) -export(rnn_genomic) -export(roc_auc_score) -export(rotrp_rotary_position_embedding) -export(rslnk_residual_connection) -export(run_ebac_selection_ipw_analysis) -export(run_morie_module) -export(run_morie_modules) -export(run_pipeline) -export(run_propensity_ipw_analysis) -export(run_treatment_effects_analysis) -export(run_weighted_logistic_analysis) -export(run_workflow_step) -export(sample_size_logistic) export(sarla) export(sarre) -export(sensitivity_rosenbaum) export(sglm) -export(sgolay_smooth) -export(shapiro_wilk_test) -export(sign_test_power) -export(simple_random_sample) export(smixd) export(spblk) export(spcrs) -export(spearman_rho) -export(spectral_density) export(sptag) export(sptau) export(sptrn) export(stacv) -export(state_space_model) export(stkrg) -export(stratified_sample) export(stvar) -export(suggest_analysis_plan) -export(sukhatme_test) -export(summarize_output_audit) -export(svm_genomic) -export(svm_hinge_primal) -export(svm_kernel_trick) -export(terry_hoeffding_test) -export(tgarch_model) -export(threshold_autoregression) -export(tolerance_limits) -export(transformer_genomic) -export(trfbl_transformer_block) -export(tsne_reduction) -export(two_sample_coverage) -export(two_sample_t_test) export(ukrig) export(unfdl) -export(unobserved_components) -export(vaenc_vae_elbo) -export(validate_cpads_data) -export(validate_outputs_manifest) -export(van_der_waerden_test) -export(vecm) -export(verify_statistical_output) export(vrgft) export(vrgm) export(vtpwr) -export(wavelet_time_series) -export(wilcoxon_power) -export(wilcoxon_signed_rank_test) export(wnom) -export(write_synthetic_data) -export(xavir_xavier_init) -export(xgboost_objective) - +importFrom(Rcpp,sourceCpp) importFrom(splines,bs) importFrom(stats,acf) importFrom(stats,aggregate) importFrom(stats,ar) importFrom(stats,arima) +importFrom(stats,as.formula) importFrom(stats,ave) importFrom(stats,chisq.test) importFrom(stats,coef) @@ -405,6 +590,7 @@ importFrom(stats,p.adjust) importFrom(stats,pchisq) importFrom(stats,plogis) importFrom(stats,pnorm) +importFrom(stats,predict) importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,residuals) @@ -419,178 +605,6 @@ importFrom(stats,weighted.mean) importFrom(stats,wilcox.test) importFrom(utils,combn) importFrom(utils,getFromNamespace) -importFrom(stats,as.formula) -importFrom(stats,predict) importFrom(utils,head) importFrom(utils,tail) - -# Added 2026-05-12 for v0.4.0 det-seed + entheo helpers -export(agenda_setter_power) -export(anisotropy_test) -export(antithetic_variates) -export(backpropagation) -export(batch_norm_forward) -export(bayesian_ideal_points) -export(bootstrap_ci) -export(bridge_observations) -export(cokriging) -export(condorcet_winner) -export(control_variates) -export(conv1d_forward) -export(conv2d_forward) -export(copula_estimation) -export(cutting_plane_sphere) -export(diffusion_forward) -export(dimensionality_test) -export(dropout_forward) -export(dynamic_wnominate) -export(extreme_value_gev) -export(fauzi_bias_reduced_kdfe) -export(fauzi_cvm_smoothed) -export(fauzi_edgeworth_quantile) -export(fauzi_h_decomposition) -export(fauzi_higher_order_kernel) -export(fauzi_kdfe_properties) -export(fauzi_kernel_quantile_asymptotic) -export(fauzi_ks_smoothed) -export(fauzi_l_statistic) -export(fauzi_mise_computation) -export(fauzi_mrl_asymptotic) -export(fauzi_mrl_boundary_free) -export(fauzi_smoothed_sign) -export(fauzi_smoothed_wilcoxon) -export(fauzi_survival_kernel) -export(forward_pass_dense) -export(gan_loss) -export(generalized_pareto) -export(geographically_weighted_regression) -export(gru_cell) -export(he_initialization) -export(horowitz_average_derivative) -export(horowitz_bandwidth_bootstrap) -export(horowitz_binary_response) -export(horowitz_censored_regression) -export(horowitz_deconvolution) -export(horowitz_duration_model) -export(horowitz_index_model) -export(horowitz_kernel_density) -export(horowitz_kernel_regression) -export(horowitz_local_ate) -export(horowitz_local_linear) -export(horowitz_mixture_model) -export(horowitz_nonparametric_iv) -export(horowitz_plr_bandwidth) -export(horowitz_plr_estimator) -export(horowitz_quantile_regression) -export(horowitz_sample_selection) -export(horowitz_smoothed_maximum_score) -export(horowitz_treatment_effect) -export(horowitz_wild_bootstrap) -export(ideal_point_model) -export(ideal_point_recovery) -export(importance_sampling) -export(indicator_kriging) -export(irt_spatial) -export(isotonic_regression) -export(jackknife_estimator) -export(kosorok_bootstrap_empirical) -export(kosorok_bracketing_number) -export(kosorok_censoring_survival) -export(kosorok_counting_process) -export(kosorok_cox_partial_likelihood) -export(kosorok_donsker_class) -export(kosorok_efficient_score) -export(kosorok_empirical_process) -export(kosorok_glivenko_cantelli) -export(kosorok_influence_function) -export(kosorok_information_bound) -export(kosorok_m_estimator) -export(kosorok_maximal_inequality) -export(kosorok_multiplier_bootstrap) -export(kosorok_nelson_aalen) -export(kosorok_one_step_estimator) -export(kosorok_profile_likelihood) -export(kosorok_tangent_space) -export(kosorok_vc_dimension) -export(kosorok_z_estimator) -export(latin_hypercube) -export(lstm_cell) -export(maxpool_forward) -export(mds_spatial_map) -export(median_voter) -export(monte_carlo_integration) -export(morie_det_rng) -export(morie_det_rng_sha_hex) -export(multi_head_attention_full) -export(multinomial_probit_spatial) -export(nonstationary_covariance) -export(optimal_classification) -export(ordinary_kriging) -export(party_alignment) -export(penalized_spline) -export(permutation_test_general) -export(polarization_index) -export(positional_encoding_abs) -export(quantile_function) -export(rangayyan_adaptive_filter) -export(rangayyan_approximate_entropy) -export(rangayyan_ar_burg) -export(rangayyan_coherence) -export(rangayyan_correlation_dimension) -export(rangayyan_dfa) -export(rangayyan_eeg_bands) -export(rangayyan_emg_rms) -export(rangayyan_envelope) -export(rangayyan_fir_filter) -export(rangayyan_higuchi_fd) -export(rangayyan_hrv) -export(rangayyan_iir_filter) -export(rangayyan_lyapunov) -export(rangayyan_psd) -export(rangayyan_qrs_detect) -export(rangayyan_sample_entropy) -export(rangayyan_stft) -export(rangayyan_wavelet_denoise) -export(rangayyan_zero_crossing) -export(residual_connection) -export(return_level) -export(rkhs_kernel_regression) -export(roll_call_analysis) -export(rotary_position_embedding) -export(scaled_dot_product_attention) -export(sobol_sequence) -export(spatial_agreement) -export(spatial_ar_error) -export(spatial_ar_lag) -export(spatial_autocorrelation) -export(spatial_block_kriging) -export(spatial_cross_validation) -export(spatial_glm) -export(spatial_mixed_model) -export(spatial_trend_surface) -export(spatiotemporal_autocovariance) -export(spatiotemporal_kriging) -export(spatiotemporal_variogram) -export(stratified_sampling) -export(thin_plate_spline) -export(transformer_block) -export(unfolding_analysis) -export(universal_kriging) -export(vae_elbo) -export(variogram_estimation) -export(variogram_fitting) -export(vine_copula) -export(voting_power_index) -export(wnominate) -export(wnominate_estimate) -export(xavier_initialization) -export(fairness_disparate_impact) -export(fairness_demographic_parity) -export(fairness_equalized_odds) -export(fairness_average_odds_difference) -export(fairness_gini) -export(fairness_bias_amplification) -export(predpol_aggregate_areas) -export(predpol_calibration_audit) -export(predpol_score_disparity) -export(predpol_temporal_audit) +useDynLib(morie, .registration = TRUE) diff --git a/r-package/morie/NEWS.md b/r-package/morie/NEWS.md index 25d7e73e3d..09f5977848 100644 --- a/r-package/morie/NEWS.md +++ b/r-package/morie/NEWS.md @@ -1,3 +1,327 @@ +# morie 0.9.5.2 - 2026-05-21 + +* **HTML validation fix.** `morie_siu_sanity_check`'s description + used `date_*_iso` and `number_of_*` as inline text, which + roxygen2's markdown mode rendered as nested `\emph{\emph{...}}` + in the generated Rd and as nested `` in the HTML manual. + win-builder flagged this as an HTML validation NOTE. Wrapping + the identifiers in backticks (now rendered as `\verb{...}`) + resolves it. +* All other fixes are inherited from 0.9.5.1: see entry below. + +# morie 0.9.5.1 - 2026-05-21 + +CRAN Policy: full cache-leak fix (supersedes 0.9.5 which was +uploaded to win-builder with incomplete cache-isolation). + +* `morie_db_connect()` default cache-dir flipped from + `tools::R_user_dir("morie", "cache")` to a session-scoped + `tempdir()` subdirectory; matches the convention already set + for `morie_fetch_siu()` and `morie_fetch_tps()` in 0.9.5. Now + no morie function writes outside `tempdir()` unless the user + explicitly opts in by passing `db_path = morie_cache_dir(...)` + or `cache_dir = morie_cache_dir(...)`. +* New `morie_cache_clear(subdir, confirm)` user-facing function + for actively-managing the persistent cache (CRAN Policy + requirement for `R_user_dir` caches). +* `morie_cache_dir(subdir)` is now exported with a `subdir` + argument so users can compose per-subsystem persistent paths. +* 3 `morie_cache_*` examples (`store`, `load`, `list`) now use + explicit `db_path = tempfile()` so `R CMD check` never writes + outside `tempdir()`. +* `morie_check_plugin_license` error-path example moved from + `\donttest{}` to `\dontrun{}` (intentionally errors when + passed an incompatible SPDX). +* `morie_fetch` placeholder-URL example moved from + `\donttest{}` to `\dontrun{}` (example.org doesn't host CSV; + the URL is a documentation placeholder). +* Two `crimsl.utoronto.ca` references in `R/mandela.R` and + `R/morie-package.R` rewritten as plain-text references; the + U of T web server returns 403 to win-builder's IP even though + the URLs are publicly reachable from browsers. +* New `inst/WORDLIST` listing real technical terms (AIPW, ATC, + ATT, CATE, Hawkes, MRM, etc.) so the win-builder + spell-checker no longer flags them. + +# morie 0.9.5 — 2026-05-21 + +Documentation + CI hardening (added 2026-05-21 to the v0.9.5 +release branch alongside the SIU + rename work): + +* **New SIU vignette** (`vignettes/siu-pipeline.Rmd`) — end-to-end + walkthrough of `morie_fetch_siu()`, `morie_siu_audit_case()`, + `morie_siu_anomaly_check()`, `morie_siu_compare()`, + `morie_siu_llm_extract()`, `morie_siu_translate()`, and the + canonical-override system. 14 vignettes total now. +* **Chi-square vignette correction.** `vignettes/chi-square-and-anova.Rmd` + previously called the MRM chi-square family the "Doob $\chi^{2}$ + family", which incorrectly singled out one of the three named + authors (Sprott, Doob, Iftene) of the source contingency tables. + Renamed to "MRM chi-square family". The Sprott / Doob / Iftene + author citation to the source tables is unchanged. +* **`_pkgdown.yml` shipped** — a minimal pkgdown configuration so + contributors can build a local documentation site with + `pkgdown::build_site()`. The file is `.Rbuildignore`d so it + doesn't ship in the CRAN tarball. +* **README rewrite** (top-level + R-package) to reflect v0.9.5 + reality: 559 morie-prefixed exports (not 87), the SIU subsystem, + free-first AI helpers (Ollama default), language-aware DRID + manifest, canonical-override system, polite-by-default fetcher, + and the green 6-cell R CMD check matrix. +* **pkgcheck workflow: `inconsolata` LaTeX font installed.** + pkgcheck's internal rcmdcheck builds the PDF manual, which + needs `inconsolata.sty`. Without it pkgcheck reported a spurious + "R CMD check found 1 warning" against a package that has 0 + warnings in the dedicated `r-cmd-check.yml` matrix. The + pkgcheck job now installs tinytex + inconsolata before running. + +lintr / goodpractice cleanups: + +* The Hawkes C++ likelihood functions now use `T_horizon` instead + of `T` for the time-horizon parameter, so the auto-generated + `R/RcppExports.R` no longer trips R linters that flag `T` as a + potential `TRUE` shadow. The math convention is preserved in the + C++ docstrings; only the parameter NAME changed. +* `setwd()` in `morie_run_workflow_step()` replaced with + `withr::local_dir()` (goodpractice no-setwd linter). +* 352+ exported functions renamed to the `morie_*` prefix so they + no longer collide with same-named functions in other CRAN + packages. Examples: `chi_square_test` → `morie_chi_square_test`, + `kmeans_clustering` → `morie_kmeans_clustering`, etc. Names that + were already morie-specific cryptic abbreviations (agset, brdgr, + fzhdc, …) are unchanged. + +SIU harvester: polite by default, manifest-aware, retry-aware, and +auditable against the original published reports. + +* **Persistent HTML cache + per-case audit.** `morie_fetch_siu(cache_html + = TRUE)` saves every fetched report and news-release page under + `/html/` (gzipped, ~80-100 MB for a full sweep). The + saved HTML is the canonical ground truth for every row in the + emitted CSV: any later question of the form "did the parser get + this field right?" is decidable by reading the cached page for + that case. `morie_siu_audit_case(case_number)` returns the + parser's 1-row data frame, the raw report and news HTML, and + HTML-stripped plain text for both, all from cache when available. +* **`morie_siu_compare()`** — line up the parser's output for a + case against a user-supplied external table (column map and case + key are caller-controlled) and show the surrounding report HTML + excerpt for each disagreement. No external source is treated as + authoritative; the function exists so the user can adjudicate + parser-vs-external mismatches against the actual published + report. The published report HTML is the only ground truth morie + recognises for SIU fields. +* **Free by default.** The LLM helpers now default to + \code{model = c("ollama", "gemini")} -- a free local Ollama + model first, with paid Gemini as fallback only if Ollama is + unavailable. Users who install Ollama and pull a free Gemma / + Qwen / Llama / Functiongemma variant + (\code{ollama pull gemma3:4b}) get the full second-coder / + audit / anomaly-check stack at \$0 ongoing cost. \code{OLLAMA_HOST} + defaults to \code{http://localhost:11434} when unset, so the + zero-config path is just "install ollama, pull a model, done". +* **AI second-coder (Gemini / Claude / Ollama).** + `morie_siu_llm_extract(case_number, model = "gemini")` sends the + cached report HTML through a large-language-model endpoint and + returns the same 64-column row format as the C++ parser, so it + drops straight into `morie_siu_compare(external = ...)` for an + independent diff. `model` accepts a character vector for + fail-over, e.g. `c("gemini", "ollama")` uses the paid Gemini + endpoint when available and silently falls back to a local / + free Ollama-compatible model otherwise. Credentials are read + from `GOOGLE_API_KEY` / `ANTHROPIC_API_KEY` / `OLLAMA_HOST`; + nothing is hard-coded. +* **`morie_siu_translate_fr_to_en()`** — self-improving SIU. + For SIU cases that exist only in French (no English-language + paired drid; ~1-2 per year of SIU output), translate the + narrative_summary, news_release_summary, news_release_title and + relevant_legislation into English via a local Ollama model + (default $0 cost, no API key needed) and persist each + translation as a canonical override via + \code{morie_siu_record_correction()}. Idempotent (skips + already-translated cases) and self-improving (every run leaves + morie better at returning English content for French-only + reports). Maintainers can promote the resulting overrides into + the shipped \code{inst/extdata/siu_canonical_overrides.csv.gz} + so all users get the English text on the next package update. +* **French police-service acronyms.** The modal-service detector + now also recognises SPT (Service de Police de Toronto), PPO + (Police provinciale de l'Ontario), SPRH (Halton), SPRY (York), + SPRP (Peel), SPRD (Durham), SPRN (Niagara), SPRW (Waterloo), + SPO (Ottawa), SPL (London), SPH (Hamilton), SPW (Windsor), SPG + (Guelph), SPK (Kingston) and maps each to the canonical English + name. Closes the remaining French-only-case gap; 12-TFD-104 in + the 2012 corpus now reports \code{Toronto Police Service} + correctly. +* **99.955% format-clean on the full 2,218-case corpus.** + Empirical measurement via `morie_siu_sanity_check()` on the + freshly-harvested SIU.csv: 2,217 / 2,218 rows have zero format + issues; the lone remaining case is a 2012 French-only report + (12-TFD-104) without an English-paired drid. The earlier + 95.45% baseline ate four further fixes: (a) Unicode apostrophe + / quote / dash normalisation in `lower_ascii()` so the title- + finder matches "Director's report" (U+2019) cleanly, (b) + "Overview" as a section_4 fallback for 2014 reports that + retitled "The Investigation", (c) French "L'enquête" / "Aperçu" + fallbacks for French-only reports, (d) full SIU police-service + acronym table (OPP, TPS, HRPS, NRPS, PRP, YRP, DRPS, WRPS, OPS, + LPS, WPS, GPS, KPS, BPS, BPPS, CKPS, PRPS, GSPS, SSMPS, SLPS, + SPS, TBPS, BPSB) -- old reports use the acronym throughout and + never spell out "Ontario Provincial Police", and the modal- + service detector now picks up "OPP" → "Ontario Provincial + Police" automatically. +* **Interleaved report + news fetch.** `morie_fetch_siu()` no longer + walks the corpus in two strict phases (fetch all reports, then + fetch all news). It now uses a rolling-window batched fetcher: + each batch of 250 reports fires in the same rate-limited pool + as the previous batch's news pages. While the next 250 reports + are downloading, the news pages for the nrids we just parsed + are downloading alongside. Roughly halves cold-start corpus + wall time (~30 min instead of ~58 min on the full 4,700-drid + sweep) without changing the per-second rate the SIU site sees. +* **Canonical overrides — the parser LEARNS from corrections.** + Every verified \code{(case_number, field, value)} tuple recorded + via \code{morie_siu_record_correction()} is applied to + \code{morie_fetch_siu()}'s output on subsequent runs. The shipped + \code{inst/extdata/siu_canonical_overrides.csv.gz} holds the + maintainer-confirmed table (starts empty in v0.9.5, populated by + the LLM-audit + human-review workflow over time). The + user-side \code{/canonical_overrides.csv} merges in + too -- users can fix their local copy without touching the + package source. This is morie's "memory": wrong cells get found + via \code{morie_siu_sanity_check()} or + \code{morie_siu_audit_columns()}, corrected once, and the fix + propagates to all users on the next package update -- no C++ + rebuild needed. +* **`morie_siu_sanity_check()`** — fast format-validity pass over + every row of an emitted SIU table. Flags case_number that + doesn't look like an SIU id, date_*_iso that isn't ISO 8601, + number_of_* that isn't a positive integer, charges_recommended + that isn't "Yes"/"No", page-chrome strings leaked into + narrative_summary or other content fields, etc. Returns a data + frame ordered worst-first so maintainers can pop the cached + HTML for any flagged row and adjudicate. Runs in milliseconds, + no network, no LLM, no API key required. +* **`morie_siu_audit_columns()`** — closed-loop per-column accuracy + audit. Runs the anomaly check across many cases and aggregates + by field, returning a data frame sorted by agreement rate + (worst first) so maintainers can prioritise which regex + extraction pattern to fix next. Concrete disagreement examples + for each field are attached as the \code{"examples"} attribute. + With \code{model = "ollama"} pointed at a local Gemma / Qwen / + DeepSeek instance the audit costs zero API spend; chain + \code{c("gemini", "ollama")} for paid-first / free-fallback. +* **`morie_siu_anomaly_check()`** — per-field "does the report + support this extraction?" audit. Sends one API call per case + (all populated fields batched into a single prompt) and returns + a data frame with `field`, `parser_value`, `verdict` + (\code{"agree"} / \code{"disagree"} / \code{"unclear"}), and a + one-sentence reason. Not authoritative -- the cached HTML is + the ground truth -- but a fast way to triage which rows a human + should re-read against the report. +* **Section-text terminator fix (parser correctness).** The + `section_text()` helper used to stop only at the next `

`, + so the LAST `

` block on a page (typically + section_8 -- analysis / decision) silently captured everything + to end-of-document, including the site's left-nav and footer. + This leaked phrases like "First Nations, Inuit and Métis + Liaison Program" and Twitter follow links into every report's + `narrative_summary`, `supplemental_materials`, and + `mental_health_or_race_indications` -- the latter would have + tagged every case in Ontario as "First Nation" regardless of + the report's actual content. The terminator now also stops at + ` 0) min(s, iq) else s if (sigma <= 0) sigma <- 1 - 1.06 * sigma * n^(-1/5) + 1.06 * sigma * n^(-1 / 5) } diff --git a/r-package/morie/R/aaa_helpers_ghosal_bnp.R b/r-package/morie/R/aaa_helpers_ghosal_bnp.R index d12f5a42ec..536684078d 100644 --- a/r-package/morie/R/aaa_helpers_ghosal_bnp.R +++ b/r-package/morie/R/aaa_helpers_ghosal_bnp.R @@ -22,13 +22,16 @@ NULL } .gh_surv_post <- function(t, ev, c, lam0) { - t <- as.numeric(t); n <- length(t) - if (n == 0) return(NULL) + t <- as.numeric(t) + n <- length(t) + if (n == 0) { + return(NULL) + } if (is.null(ev)) ev <- rep(1L, n) if (is.null(lam0)) lam0 <- 1 / max(mean(t), 1e-6) uniq <- sort(unique(t)) - Y <- sapply(uniq, function(tk) sum(t >= tk)) - dN <- sapply(uniq, function(tk) sum(t == tk & ev == 1)) + Y <- vapply(uniq, function(tk) sum(t >= tk), numeric(1)) + dN <- vapply(uniq, function(tk) sum(t == tk & ev == 1), numeric(1)) dH0 <- diff(c(0, uniq)) * lam0 dHp <- (c * dH0 + dN) / (c + Y) S <- cumprod(1 - pmin(dHp, 1 - 1e-12)) @@ -36,9 +39,11 @@ NULL } .gh_haar_dwt <- function(y) { - L <- 1L; while (L < length(y)) L <- 2L * L + L <- 1L + while (L < length(y)) L <- 2L * L if (L > length(y)) y <- c(y, rep(0, L - length(y))) - coeffs <- list(); cur <- y + coeffs <- list() + cur <- y while (length(cur) > 1L) { a <- (cur[seq(1, length(cur), by = 2)] + cur[seq(2, length(cur), by = 2)]) / sqrt(2) d <- (cur[seq(1, length(cur), by = 2)] - cur[seq(2, length(cur), by = 2)]) / sqrt(2) diff --git a/r-package/morie/R/aaa_helpers_horowitz.R b/r-package/morie/R/aaa_helpers_horowitz.R index 533cfaadf5..e19f6f7db3 100644 --- a/r-package/morie/R/aaa_helpers_horowitz.R +++ b/r-package/morie/R/aaa_helpers_horowitz.R @@ -9,12 +9,14 @@ NULL .hrz_silverman <- function(x) { x <- as.numeric(x) n <- length(x) - if (n < 2L) return(1.0) + if (n < 2L) { + return(1.0) + } s <- stats::sd(x) iqr <- diff(stats::quantile(x, c(0.25, 0.75), na.rm = TRUE)) sigma <- if (iqr > 0) min(s, iqr / 1.349) else s if (sigma <= 0) sigma <- max(s, 1e-6) - unname(1.06 * sigma * n ^ (-1/5)) + unname(1.06 * sigma * n^(-1 / 5)) } @@ -29,7 +31,8 @@ NULL u <- outer(z, z, `-`) / h w <- exp(-0.5 * u^2) } else { - n <- nrow(z); w <- matrix(0, n, n) + n <- nrow(z) + w <- matrix(0, n, n) for (j in seq_len(ncol(z))) { u <- outer(z[, j], z[, j], `-`) / h w <- w + u^2 @@ -37,22 +40,26 @@ NULL w <- exp(-0.5 * w) } diag(w) <- 0 - wsum <- rowSums(w); safe <- ifelse(wsum > 0, wsum, 1) + wsum <- rowSums(w) + safe <- ifelse(wsum > 0, wsum, 1) as.numeric((w %*% y) / safe) } .hrz_probit_newton <- function(D, Z, maxit = 50, tol = 1e-8) { - q <- ncol(Z); beta <- rep(0, q) + q <- ncol(Z) + beta <- rep(0, q) for (k in 1:maxit) { eta <- pmin(pmax(as.numeric(Z %*% beta), -50), 50) - p <- stats::pnorm(eta); phi <- stats::dnorm(eta) + p <- stats::pnorm(eta) + phi <- stats::dnorm(eta) w <- phi * (D - p) / pmax(p * (1 - p), 1e-8) Hd <- phi^2 / pmax(p * (1 - p), 1e-8) g <- t(Z) %*% w H <- t(Z) %*% (Z * Hd) step <- tryCatch(solve(H + 1e-8 * diag(q), g), - error = function(e) MASS::ginv(H) %*% g) + error = function(e) MASS::ginv(H) %*% g + ) beta <- beta + step if (max(abs(step)) < tol) break } @@ -61,7 +68,8 @@ NULL .hrz_logit_newton <- function(D, X, maxit = 50, tol = 1e-8) { - p <- ncol(X); beta <- rep(0, p) + p <- ncol(X) + beta <- rep(0, p) for (k in 1:maxit) { eta <- pmin(pmax(as.numeric(X %*% beta), -50), 50) mu <- 1 / (1 + exp(-eta)) @@ -69,7 +77,8 @@ NULL g <- t(X) %*% (D - mu) H <- t(X) %*% (X * W) step <- tryCatch(solve(H + 1e-8 * diag(p), g), - error = function(e) MASS::ginv(H) %*% g) + error = function(e) MASS::ginv(H) %*% g + ) beta <- beta + step if (max(abs(step)) < tol) break } @@ -82,11 +91,16 @@ NULL for (k in 1:maxit) { r <- y - X %*% beta w <- ifelse(r > 0, tau / pmax(r, 1e-6), - (1 - tau) / pmax(-r, 1e-6)) + (1 - tau) / pmax(-r, 1e-6) + ) w <- as.numeric(w) new <- tryCatch(solve(t(X) %*% (X * w), t(X) %*% (w * y)), - error = function(e) MASS::ginv(t(X) %*% (X * w)) %*% (t(X) %*% (w * y))) - if (max(abs(new - beta)) < tol) { beta <- new; break } + error = function(e) MASS::ginv(t(X) %*% (X * w)) %*% (t(X) %*% (w * y)) + ) + if (max(abs(new - beta)) < tol) { + beta <- new + break + } beta <- new } as.numeric(beta) @@ -94,7 +108,8 @@ NULL .hrz_hermite <- function(t, J) { - n <- length(t); H <- matrix(0, n, J) + n <- length(t) + H <- matrix(0, n, J) H[, 1] <- 1 if (J > 1) H[, 2] <- t if (J > 2) for (k in 3:J) H[, k] <- t * H[, k - 1] - (k - 2) * H[, k - 2] diff --git a/r-package/morie/R/aaa_helpers_llm_arch.R b/r-package/morie/R/aaa_helpers_llm_arch.R index 17eb5cce4d..02646dfaff 100644 --- a/r-package/morie/R/aaa_helpers_llm_arch.R +++ b/r-package/morie/R/aaa_helpers_llm_arch.R @@ -8,15 +8,18 @@ NULL .softmax_last <- function(x) { # softmax along the last axis of an array - d <- dim(x); nd <- length(d) + d <- dim(x) + nd <- length(d) if (is.null(d) || nd == 1L) { x <- x - max(x) e <- exp(x) return(e / sum(e)) } - apply(x, seq_len(nd - 1L), function(v) { - v <- v - max(v); e <- exp(v); e / sum(e) - }) -> out + out <- apply(x, seq_len(nd - 1L), function(v) { + v <- v - max(v) + e <- exp(v) + e / sum(e) + }) # apply collapses last axis to first; transpose back aperm(out, c(seq.int(2L, nd), 1L)) } diff --git a/r-package/morie/R/aaa_helpers_montesinos.R b/r-package/morie/R/aaa_helpers_montesinos.R index d446173f2c..c2fdb8d814 100644 --- a/r-package/morie/R/aaa_helpers_montesinos.R +++ b/r-package/morie/R/aaa_helpers_montesinos.R @@ -16,33 +16,37 @@ #' @return Named list with estimate (G matrix), diag_mean, off_mean, p, n, m, method. #' @references VanRaden (2008) J Dairy Sci 91:4414. Montesinos Lopez Ch 3. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_grm_vanraden(markers = matrix(sample(0:2, 200, TRUE), 50, 4)) #' @export -grm_vanraden <- function(markers, method = 1) { - M <- as.matrix(markers); storage.mode(M) <- "double" - n <- nrow(M); m <- ncol(M) +morie_grm_vanraden <- function(markers, method = 1) { + M <- as.matrix(markers) + storage.mode(M) <- "double" + n <- nrow(M) + m <- ncol(M) p <- colMeans(M) / 2 Z <- sweep(M, 2, 2 * p, "-") if (identical(method, 2)) { - s <- sqrt(2 * p * (1 - p)); s[s <= 0] <- 1 + s <- sqrt(2 * p * (1 - p)) + s[s <= 0] <- 1 Z <- sweep(Z, 2, s, "/") denom <- m method_str <- "VanRaden method 2 (per-locus scaled)" } else { - denom <- 2 * sum(p * (1 - p)); if (denom <= 0) denom <- 1 + denom <- 2 * sum(p * (1 - p)) + if (denom <= 0) denom <- 1 method_str <- "VanRaden method 1 (sum-2pq)" } G <- tcrossprod(Z) / denom diag_mean <- mean(diag(G)) - off <- G; diag(off) <- 0 + off <- G + diag(off) <- 0 off_mean <- if (n > 1) sum(off) / (n * (n - 1)) else 0 - list(estimate = G, diag_mean = diag_mean, off_mean = off_mean, - p = p, n = n, m = m, method = method_str) + list( + estimate = G, diag_mean = diag_mean, off_mean = off_mean, + p = p, n = n, m = m, method = method_str + ) } # CANONICAL TEST # set.seed(0); M <- matrix(sample(0:2, 20, TRUE), 4, 5) -# grm_vanraden(M)$diag_mean # ~1 in expectation +# morie_grm_vanraden(M)$diag_mean # ~1 in expectation diff --git a/r-package/morie/R/aaa_helpers_samples.R b/r-package/morie/R/aaa_helpers_samples.R deleted file mode 100644 index 807db71999..0000000000 --- a/r-package/morie/R/aaa_helpers_samples.R +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -#' Load a bundled reference sample CSV by name -#' -#' Parity with the Python \code{morie.load_sample()} helper. Reads a -#' small reference dataset shipped under \code{inst/extdata/samples/}. -#' -#' @param name One of \code{"otis_b01"}, \code{"otis_b09"}, -#' \code{"otis_c11"}, \code{"tps_assault"}. -#' @return A \code{data.frame} loaded from the bundled CSV. -#' @examples -#' df <- morie_sample("otis_b01") -#' head(df) -#' @export -morie_sample <- function(name) { - files <- list( - otis_b01 = "otis_b01_sample.csv", - otis_b09 = "otis_b09_sample.csv", - otis_c11 = "otis_c11_sample.csv", - tps_assault = "tps_assault_sample.csv" - ) - if (!name %in% names(files)) { - stop(sprintf("Unknown sample '%s'; choices: %s", name, - paste(names(files), collapse = ", "))) - } - path <- system.file("extdata", "samples", files[[name]], package = "morie") - if (path == "") { - stop(sprintf("Sample CSV for '%s' not found in installed package.", name)) - } - utils::read.csv(path) -} diff --git a/r-package/morie/R/accgp.R b/r-package/morie/R/accgp.R index 9d12bee3ed..349fbe95a5 100644 --- a/r-package/morie/R/accgp.R +++ b/r-package/morie/R/accgp.R @@ -7,39 +7,55 @@ #' #' @param y_true Numeric observed. #' @param y_pred Numeric predicted. -#' @return list(estimate (Pearson r), pearson_r, spearman_rho, mse, mspe, +#' @return list(estimate (Pearson r), pearson_r, morie_spearman_rho, mse, mspe, #' rmse, r2, slope, intercept, n, method). #' @references Montesinos Lopez Ch 2. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_prediction_accuracy(y_true = rbinom(50, 1, 0.5), y_pred = rbinom(50, 1, 0.5)) #' @export -prediction_accuracy <- function(y_true, y_pred) { - y_true <- as.numeric(y_true); y_pred <- as.numeric(y_pred) +morie_prediction_accuracy <- function(y_true, y_pred) { + y_true <- as.numeric(y_true) + y_pred <- as.numeric(y_pred) n <- length(y_true) if (n != length(y_pred)) stop("y_true and y_pred must be same length") - if (n < 2) return(list(estimate = NA_real_, n = n, - method = "Genomic prediction accuracy (n<2)")) + if (n < 2) { + return(list( + estimate = NA_real_, n = n, + method = "Genomic prediction accuracy (n<2)" + )) + } mse <- mean((y_true - y_pred)^2) rmse <- sqrt(mse) var_y <- stats::var(y_true) r2 <- if (var_y > 0) 1 - mse / var_y else NA_real_ - r <- if (stats::sd(y_true) > 0 && stats::sd(y_pred) > 0) - stats::cor(y_true, y_pred) else NA_real_ - rho <- if (stats::sd(y_true) > 0 && stats::sd(y_pred) > 0) - stats::cor(y_true, y_pred, method = "spearman") else NA_real_ - slope <- if (stats::var(y_pred) > 0) - stats::cov(y_true, y_pred) / stats::var(y_pred) else NA_real_ - intercept <- if (!is.na(slope)) mean(y_true) - slope * mean(y_pred) - else NA_real_ - list(estimate = r, pearson_r = r, spearman_rho = rho, - mse = mse, mspe = mse, rmse = rmse, r2 = r2, - slope = slope, intercept = intercept, - n = n, method = "Pearson r + Spearman rho + MSE/MSPE + calibration") + r <- if (stats::sd(y_true) > 0 && stats::sd(y_pred) > 0) { + stats::cor(y_true, y_pred) + } else { + NA_real_ + } + rho <- if (stats::sd(y_true) > 0 && stats::sd(y_pred) > 0) { + stats::cor(y_true, y_pred, method = "spearman") + } else { + NA_real_ + } + slope <- if (stats::var(y_pred) > 0) { + stats::cov(y_true, y_pred) / stats::var(y_pred) + } else { + NA_real_ + } + intercept <- if (!is.na(slope)) { + mean(y_true) - slope * mean(y_pred) + } else { + NA_real_ + } + list( + estimate = r, pearson_r = r, morie_spearman_rho = rho, + mse = mse, mspe = mse, rmse = rmse, r2 = r2, + slope = slope, intercept = intercept, + n = n, method = "Pearson r + Spearman rho + MSE/MSPE + calibration" + ) } # CANONICAL TEST # y <- c(1,2,3,4,5); y_hat <- c(1.1,1.9,3.2,3.8,5.1) -# prediction_accuracy(y, y_hat)$pearson_r +# morie_prediction_accuracy(y, y_hat)$pearson_r diff --git a/r-package/morie/R/agset.R b/r-package/morie/R/agset.R index d370d1593e..076ce88bb9 100644 --- a/r-package/morie/R/agset.R +++ b/r-package/morie/R/agset.R @@ -12,38 +12,42 @@ #' @return Named list with `chosen`, `power`, `setter_ideal`, #' `reversion`, `win_set_size`, `win_set_bounds`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export agset <- function(options, setter_ideal, reversion) { options <- as.numeric(options) setter_ideal <- as.numeric(setter_ideal)[1] reversion <- as.numeric(reversion)[1] - if (length(options) == 0L) - return(list(chosen = NA_real_, power = 0, - setter_ideal = setter_ideal, reversion = reversion, - win_set_size = 0L, - win_set_bounds = c(NA_real_, NA_real_), - method = "agenda_setter_power")) + if (length(options) == 0L) { + return(list( + chosen = NA_real_, power = 0, + setter_ideal = setter_ideal, reversion = reversion, + win_set_size = 0L, + win_set_bounds = c(NA_real_, NA_real_), + method = "morie_agenda_setter_power" + )) + } median_voter_pt <- (setter_ideal + reversion) / 2 win_lo <- min(reversion, 2 * median_voter_pt - reversion) win_hi <- max(reversion, 2 * median_voter_pt - reversion) in_win <- options >= win_lo & options <= win_hi - chosen <- if (!any(in_win)) reversion - else { - feas <- options[in_win] - feas[which.min(abs(feas - setter_ideal))] - } - list(chosen = chosen, power = abs(chosen - reversion), - setter_ideal = setter_ideal, reversion = reversion, - win_set_size = as.integer(sum(in_win)), - win_set_bounds = c(win_lo, win_hi), - method = "agenda_setter_power") + chosen <- if (!any(in_win)) { + reversion + } else { + feas <- options[in_win] + feas[which.min(abs(feas - setter_ideal))] + } + list( + chosen = chosen, power = abs(chosen - reversion), + setter_ideal = setter_ideal, reversion = reversion, + win_set_size = as.integer(sum(in_win)), + win_set_bounds = c(win_lo, win_hi), + method = "morie_agenda_setter_power" + ) } #' @keywords internal #' @rdname agset #' @export -agenda_setter_power <- agset +morie_agenda_setter_power <- agset diff --git a/r-package/morie/R/algnm.R b/r-package/morie/R/algnm.R index 15b3e21125..11be52dd72 100644 --- a/r-package/morie/R/algnm.R +++ b/r-package/morie/R/algnm.R @@ -11,29 +11,35 @@ #' @return Named list with `estimate`, `per_party` (if 2-D), `n`, #' `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' algnm(x = rnorm(50)) #' @export algnm <- function(x, party = NULL) { X <- if (is.matrix(x)) x else as.numeric(x) if (!is.matrix(X)) { valid <- X[!is.na(X)] - if (length(valid) == 0L) + if (length(valid) == 0L) { return(list(estimate = NA_real_, n = 0L, method = "rice_cohesion")) - p_yea <- mean(valid == 1); p_nay <- mean(valid == 0) - return(list(estimate = abs(p_yea - p_nay), pct_yea = p_yea, - pct_nay = p_nay, n = length(valid), - method = "rice_cohesion")) + } + p_yea <- mean(valid == 1) + p_nay <- mean(valid == 0) + return(list( + estimate = abs(p_yea - p_nay), pct_yea = p_yea, + pct_nay = p_nay, n = length(valid), + method = "rice_cohesion" + )) } - n <- nrow(X); m <- ncol(X) + n <- nrow(X) + m <- ncol(X) per <- list() rice_for <- function(sub) { vapply(seq_len(m), function(j) { - col <- sub[, j]; col <- col[!is.na(col)] - if (length(col) == 0L) NA_real_ - else abs(mean(col == 1) - mean(col == 0)) + col <- sub[, j] + col <- col[!is.na(col)] + if (length(col) == 0L) { + NA_real_ + } else { + abs(mean(col == 1) - mean(col == 0)) + } }, numeric(1)) } if (is.null(party)) { @@ -44,15 +50,18 @@ algnm <- function(x, party = NULL) { if (length(party) != n) stop("party length must match n rows") for (lbl in unique(party)) { per[[as.character(lbl)]] <- mean(rice_for(X[party == lbl, , drop = FALSE]), - na.rm = TRUE) + na.rm = TRUE + ) } overall <- mean(unlist(per), na.rm = TRUE) } - list(estimate = overall, per_party = per, n = n, m = m, - method = "rice_cohesion") + list( + estimate = overall, per_party = per, n = n, m = m, + method = "rice_cohesion" + ) } #' @keywords internal #' @rdname algnm #' @export -party_alignment <- algnm +morie_party_alignment <- algnm diff --git a/r-package/morie/R/aniso.R b/r-package/morie/R/aniso.R index 450d6cc845..8590c0c2e8 100644 --- a/r-package/morie/R/aniso.R +++ b/r-package/morie/R/aniso.R @@ -14,55 +14,65 @@ #' directions_deg, n, method. #' @references Goovaerts (1997); Schabenberger & Gotway (2005), Ch 3. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_aniso(x = rnorm(50), coords = matrix(runif(100), 50, 2)) #' @export -aniso <- function(x, coords, n_dirs = 4, tol_deg = 22.5) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else +morie_aniso <- function(x, coords, n_dirs = 4, tol_deg = 22.5) { + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } if (nrow(coords) != n) stop("coords rows must match length(x)") if (ncol(coords) == 1) { - return(list(statistic = 0, p_value = 1, n = n, - method = "Anisotropy test (1D: trivially isotropic)")) + return(list( + statistic = 0, p_value = 1, n = n, + method = "Anisotropy test (1D: trivially isotropic)" + )) } tol <- tol_deg * pi / 180 angles <- seq(0, pi, length.out = n_dirs + 1)[-(n_dirs + 1)] iu <- which(upper.tri(matrix(0, n, n)), arr.ind = TRUE) dv <- coords[iu[, 2], , drop = FALSE] - coords[iu[, 1], , drop = FALSE] ang <- atan2(dv[, 2], dv[, 1]) %% pi - groups <- list(); means <- numeric(); kept_angles <- numeric() + groups <- list() + means <- numeric() + kept_angles <- numeric() for (a in angles) { diff <- abs(ang - a) diff <- pmin(diff, pi - diff) mask <- diff <= tol if (sum(mask) < 2) next - d2 <- (x[iu[mask, 1]] - x[iu[mask, 2]]) ^ 2 + d2 <- (x[iu[mask, 1]] - x[iu[mask, 2]])^2 groups[[length(groups) + 1]] <- d2 means <- c(means, 0.5 * mean(d2)) kept_angles <- c(kept_angles, a * 180 / pi) } if (length(groups) < 2) { - return(list(statistic = NA_real_, p_value = NA_real_, n = n, - method = "Anisotropy test (insufficient pairs)")) + return(list( + statistic = NA_real_, p_value = NA_real_, n = n, + method = "Anisotropy test (insufficient pairs)" + )) } # Levene's test using base R via group-wise abs-from-median med_all <- vapply(groups, stats::median, numeric(1)) abs_dev <- mapply(function(g, m) abs(g - m), groups, med_all, - SIMPLIFY = FALSE) + SIMPLIFY = FALSE + ) combined <- unlist(abs_dev) grp_idx <- rep(seq_along(abs_dev), vapply(abs_dev, length, integer(1))) fit <- stats::aov(combined ~ factor(grp_idx)) s <- summary(fit)[[1]] - list(statistic = s$`F value`[1], p_value = s$`Pr(>F)`[1], - directional_gamma = means, directions_deg = kept_angles, - n = n, - method = sprintf("Anisotropy test (Levene across %d directions)", n_dirs)) + list( + statistic = s$`F value`[1], p_value = s$`Pr(>F)`[1], + directional_gamma = means, directions_deg = kept_angles, + n = n, + method = sprintf("Anisotropy test (Levene across %d directions)", n_dirs) + ) } -#' @rdname aniso +#' @rdname morie_aniso #' @keywords internal #' @export -anisotropy_test <- aniso +morie_anisotropy_test <- morie_aniso diff --git a/r-package/morie/R/antth.R b/r-package/morie/R/antth.R index 694fe4dfd1..ab3b585625 100644 --- a/r-package/morie/R/antth.R +++ b/r-package/morie/R/antth.R @@ -14,26 +14,31 @@ antth <- function(x = NULL, f = NULL, N = 1000L, seed = 42L) { if (is.null(f)) f <- function(u) u set.seed(seed) - if (is.null(x)) u <- stats::runif(N) else { + if (is.null(x)) { + u <- stats::runif(N) + } else { u <- as.numeric(x) - if (min(u) < 0 || max(u) > 1) + if (min(u) < 0 || max(u) > 1) { u <- (rank(u)) / (length(u) + 1) + } } n <- length(u) fu <- vapply(u, f, numeric(1)) fu_anti <- vapply(1 - u, f, numeric(1)) paired <- 0.5 * (fu + fu_anti) est_av <- mean(paired) - se_av <- stats::sd(paired) / sqrt(n) + se_av <- stats::sd(paired) / sqrt(n) est_crude <- mean(fu) var_crude <- stats::var(fu) / n ratio <- if (var_crude > 0) se_av^2 / var_crude else NA_real_ - list(estimate = as.numeric(est_av), - estimate_crude = as.numeric(est_crude), - se = as.numeric(se_av), - var_ratio_av_over_crude = as.numeric(ratio), - n_pairs = as.integer(n), - method = "Antithetic variates (Hammersley & Morton 1956)") + list( + estimate = as.numeric(est_av), + estimate_crude = as.numeric(est_crude), + se = as.numeric(se_av), + var_ratio_av_over_crude = as.numeric(ratio), + n_pairs = as.integer(n), + method = "Antithetic variates (Hammersley & Morton 1956)" + ) } # CANONICAL TEST @@ -44,4 +49,4 @@ antth <- function(x = NULL, f = NULL, N = 1000L, seed = 42L) { #' @rdname antth #' @keywords internal #' @export -antithetic_variates <- antth +morie_antithetic_variates <- antth diff --git a/r-package/morie/R/archm.R b/r-package/morie/R/archm.R index bf43484e63..a450dc19d6 100644 --- a/r-package/morie/R/archm.R +++ b/r-package/morie/R/archm.R @@ -1,44 +1,61 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: ARCH(1)-in-mean negative log-likelihood. Extracted from the +# morie_arch_in_mean() optimiser closure so the parameter-domain guard is +# directly unit-testable. `y` is the series, `n` its length. +.archm_negll <- function(p, y, n) { + mu <- p[1] + delta <- p[2] + omega <- p[3] + alpha <- p[4] + if (omega <= 0 || alpha < 0 || alpha >= 0.999) { + return(1e10) + } + s2 <- numeric(n) + s2[1] <- max(var(y), 1e-10) + eps <- numeric(n) + eps[1] <- y[1] - mu - delta * sqrt(s2[1]) + for (t in 2:n) { + s2[t] <- max(omega + alpha * eps[t - 1]^2, 1e-12) + eps[t] <- y[t] - mu - delta * sqrt(s2[t]) + } + 0.5 * sum(log(2 * pi * s2) + eps^2 / s2) +} + #' ARCH(1)-in-mean model #' -#' @inheritParams garch_fit +#' @inheritParams morie_garch_fit #' @return Named list with \code{mu, delta, omega, alpha, loglik, #' conditional_variance, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_arch_in_mean(x = rnorm(50)) #' @export -arch_in_mean <- function(x) { - y <- as.numeric(x); n <- length(y) +morie_arch_in_mean <- function(x) { + y <- as.numeric(x) + n <- length(y) if (n < 20) stop("Need >=20 obs.") - neg_ll <- function(p) { - mu <- p[1]; delta <- p[2]; omega <- p[3]; alpha <- p[4] - if (omega <= 0 || alpha < 0 || alpha >= 0.999) return(1e10) - s2 <- numeric(n); s2[1] <- max(var(y), 1e-10) - eps <- numeric(n); eps[1] <- y[1] - mu - delta * sqrt(s2[1]) - for (t in 2:n) { - s2[t] <- max(omega + alpha * eps[t - 1]^2, 1e-12) - eps[t] <- y[t] - mu - delta * sqrt(s2[t]) - } - 0.5 * sum(log(2 * pi * s2) + eps^2 / s2) - } + neg_ll <- function(p) .archm_negll(p, y, n) var_y <- var(y) opt <- nlminb(c(mean(y), 0, var_y * 0.5, 0.2), neg_ll, - lower = c(-10, -10, 1e-8, 1e-8), - upper = c(10, 10, var_y * 10, 0.999)) - mu <- opt$par[1]; delta <- opt$par[2] - omega <- opt$par[3]; alpha <- opt$par[4] - s2 <- numeric(n); s2[1] <- var_y - eps <- numeric(n); eps[1] <- y[1] - mu - delta * sqrt(s2[1]) + lower = c(-10, -10, 1e-8, 1e-8), + upper = c(10, 10, var_y * 10, 0.999) + ) + mu <- opt$par[1] + delta <- opt$par[2] + omega <- opt$par[3] + alpha <- opt$par[4] + s2 <- numeric(n) + s2[1] <- var_y + eps <- numeric(n) + eps[1] <- y[1] - mu - delta * sqrt(s2[1]) for (t in 2:n) { s2[t] <- omega + alpha * eps[t - 1]^2 eps[t] <- y[t] - mu - delta * sqrt(s2[t]) } - list(mu = mu, delta = delta, omega = omega, alpha = alpha, - loglik = -opt$objective, - conditional_variance = s2, n = n, - method = "ARCH(1)-in-mean Gaussian MLE (base R)") + list( + mu = mu, delta = delta, omega = omega, alpha = alpha, + loglik = -opt$objective, + conditional_variance = s2, n = n, + method = "ARCH(1)-in-mean Gaussian MLE (base R)" + ) } diff --git a/r-package/morie/R/attnq.R b/r-package/morie/R/attnq.R index aaf623b699..a5c028123f 100644 --- a/r-package/morie/R/attnq.R +++ b/r-package/morie/R/attnq.R @@ -14,12 +14,9 @@ #' @return Named list \code{(output, estimate, attn, logits, d_k, method)}. #' @references Vaswani et al. (2017), NeurIPS. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_attnq_scaled_dot_product_attention(Q = matrix(rnorm(150), 50, 3)) #' @export -attnq_scaled_dot_product_attention <- function(Q, K = NULL, V = NULL, +morie_attnq_scaled_dot_product_attention <- function(Q, K = NULL, V = NULL, mask = NULL) { Q <- as.matrix(Q) if (is.null(K)) K <- Q else K <- as.matrix(K) @@ -32,12 +29,14 @@ attnq_scaled_dot_product_attention <- function(Q, K = NULL, V = NULL, e <- exp(sweep(logits, 1L, m, "-")) attn <- sweep(e, 1L, rowSums(e), "/") out <- attn %*% V - list(output = out, estimate = out, attn = attn, logits = logits, - d_k = as.integer(d_k), - method = "Scaled dot-product attention") + list( + output = out, estimate = out, attn = attn, logits = logits, + d_k = as.integer(d_k), + method = "Scaled dot-product attention" + ) } -#' @rdname attnq_scaled_dot_product_attention +#' @rdname morie_attnq_scaled_dot_product_attention #' @keywords internal #' @export -scaled_dot_product_attention <- attnq_scaled_dot_product_attention +morie_scaled_dot_product_attention <- morie_attnq_scaled_dot_product_attention diff --git a/r-package/morie/R/bglup.R b/r-package/morie/R/bglup.R index 717cbd43a0..32f47d7623 100644 --- a/r-package/morie/R/bglup.R +++ b/r-package/morie/R/bglup.R @@ -15,31 +15,40 @@ #' @return list(estimate, beta, beta_pip, pi, sigma_b2, sigma2, n_iter, n, p, method). #' @references Habier-Fernando-Kizilkaya-Garrick (2011); Montesinos Lopez Ch 4. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_bayes_cpi_genomic(x = rnorm(50), y = rnorm(50)) #' @export -bayes_cpi_genomic <- function(x, y, n_iter = 300, burn = 100, - pi_init = 0.1, seed = 0, - deterministic_seed = NULL) { +morie_bayes_cpi_genomic <- function(x, y, n_iter = 300, burn = 100, + pi_init = 0.1, seed = 0, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("bglup", deterministic_seed) } else { set.seed(seed) } - X <- as.matrix(x); y <- as.numeric(y); n <- nrow(X); p <- ncol(X) - ym <- mean(y); yc <- y - ym + X <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(X) + p <- ncol(X) + ym <- mean(y) + yc <- y - ym Xc <- sweep(X, 2, colMeans(X)) var_y <- if (n > 1) stats::var(yc) else 1 - sigma_b2 <- var_y / max(p, 1); sigma2 <- var_y + sigma_b2 <- var_y / max(p, 1) + sigma2 <- var_y pi_in <- as.numeric(pi_init) - df_b <- 4; S_b <- max(sigma_b2 * (df_b - 2) / df_b, 1e-6) - df_e <- 4; S_e <- max(var_y * (df_e - 2) / df_e, 1e-6) - beta <- rep(0, p); delta <- rep(0L, p) - r <- yc; xtx_diag <- colSums(Xc^2) - bsum <- matrix(0, 0, p); psum <- matrix(0, 0, p) - piv <- numeric(0); sbv <- numeric(0); sv2 <- numeric(0) + df_b <- 4 + S_b <- max(sigma_b2 * (df_b - 2) / df_b, 1e-6) + df_e <- 4 + S_e <- max(var_y * (df_e - 2) / df_e, 1e-6) + beta <- rep(0, p) + delta <- rep(0L, p) + r <- yc + xtx_diag <- colSums(Xc^2) + bsum <- matrix(0, 0, p) + psum <- matrix(0, 0, p) + piv <- numeric(0) + sbv <- numeric(0) + sv2 <- numeric(0) for (it in seq_len(n_iter)) { for (j in seq_len(p)) { xj <- Xc[, j] @@ -49,12 +58,16 @@ bayes_cpi_genomic <- function(x, y, n_iter = 300, burn = 100, log_bf <- 0.5 * log(1 / max(sigma_b2 * v, 1e-30)) + 0.5 * v * mn^2 log_pi <- log(max(pi_in, 1e-30)) log_1mp <- log(max(1 - pi_in, 1e-30)) - log_p1 <- log_pi + log_bf; log_p0 <- log_1mp + log_p1 <- log_pi + log_bf + log_p0 <- log_1mp mx <- max(log_p1, log_p0) prob_in <- exp(log_p1 - mx) / (exp(log_p1 - mx) + exp(log_p0 - mx)) delta[j] <- as.integer(stats::runif(1) < prob_in) - if (delta[j] == 1L) beta[j] <- stats::rnorm(1, mn, 1 / sqrt(v)) - else beta[j] <- 0 + if (delta[j] == 1L) { + beta[j] <- stats::rnorm(1, mn, 1 / sqrt(v)) + } else { + beta[j] <- 0 + } r <- r_j - xj * beta[j] } k_in <- sum(delta) @@ -62,24 +75,29 @@ bayes_cpi_genomic <- function(x, y, n_iter = 300, burn = 100, df_post <- df_b + max(k_in, 1) scale_post <- (S_b * df_b + sum(beta[delta == 1L]^2)) / df_post sigma_b2 <- max(scale_post * df_post / - max(stats::rchisq(1, df_post), 1e-8), 1e-12) + max(stats::rchisq(1, df_post), 1e-8), 1e-12) df_post_e <- df_e + n scale_post_e <- (S_e * df_e + sum(r^2)) / df_post_e sigma2 <- max(scale_post_e * df_post_e / - max(stats::rchisq(1, df_post_e), 1e-8), 1e-12) + max(stats::rchisq(1, df_post_e), 1e-8), 1e-12) if (it > burn) { - bsum <- rbind(bsum, beta); psum <- rbind(psum, delta) - piv <- c(piv, pi_in); sbv <- c(sbv, sigma_b2); sv2 <- c(sv2, sigma2) + bsum <- rbind(bsum, beta) + psum <- rbind(psum, delta) + piv <- c(piv, pi_in) + sbv <- c(sbv, sigma_b2) + sv2 <- c(sv2, sigma2) } } beta_hat <- colMeans(bsum) pip <- colMeans(psum) - list(estimate = mean(abs(beta_hat)), beta = beta_hat, beta_pip = pip, - pi = mean(piv), sigma_b2 = mean(sbv), sigma2 = mean(sv2), - intercept = ym, n_iter = length(sv2), n = n, p = p, - method = "BayesC-pi short Gibbs") + list( + estimate = mean(abs(beta_hat)), beta = beta_hat, beta_pip = pip, + pi = mean(piv), sigma_b2 = mean(sbv), sigma2 = mean(sv2), + intercept = ym, n_iter = length(sv2), n = n, p = p, + method = "BayesC-pi short Gibbs" + ) } # CANONICAL TEST # set.seed(11); X <- matrix(rnorm(180), 30, 6); b <- c(1,0,0,-1,0,0) -# y <- X %*% b + 0.1*rnorm(30); bayes_cpi_genomic(X, y, seed=11)$beta_pip +# y <- X %*% b + 0.1*rnorm(30); morie_bayes_cpi_genomic(X, y, seed=11)$beta_pip diff --git a/r-package/morie/R/bkprp.R b/r-package/morie/R/bkprp.R index cf65805799..35a32c1b19 100644 --- a/r-package/morie/R/bkprp.R +++ b/r-package/morie/R/bkprp.R @@ -18,18 +18,18 @@ #' @return Named list \code{(loss, estimate, dW, db, dx, a, z, method)}. #' @references Rumelhart, Hinton & Williams (1986); Goodfellow et al. (2016). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_bkprp_backpropagation(x = rnorm(50), y = rnorm(50)) #' @export -bkprp_backpropagation <- function(x, y, w = NULL, b = NULL, +morie_bkprp_backpropagation <- function(x, y, w = NULL, b = NULL, activation = "sigmoid") { - x <- as.matrix(x); y <- as.matrix(y) - n_in <- ncol(x); n_out <- ncol(y) + x <- as.matrix(x) + y <- as.matrix(y) + n_in <- ncol(x) + n_out <- ncol(y) if (is.null(w)) w <- diag(1, n_out, n_in) if (is.null(b)) b <- rep(0, n_out) - w <- as.matrix(w); b <- as.numeric(b) + w <- as.matrix(w) + b <- as.numeric(b) z <- sweep(x %*% t(w), 2L, b, "+") a <- .bkprp_sigma(z, activation) dsig <- .bkprp_sigma_prime(z, activation, a) @@ -40,13 +40,17 @@ bkprp_backpropagation <- function(x, y, w = NULL, b = NULL, dW <- t(delta) %*% x / batch db <- colSums(delta) / batch dx <- delta %*% w / batch - list(loss = loss, estimate = loss, dW = dW, db = db, dx = dx, a = a, z = z, - method = "Backpropagation gradient computation") + list( + loss = loss, estimate = loss, dW = dW, db = db, dx = dx, a = a, z = z, + method = "Backpropagation gradient computation" + ) } .bkprp_sigma <- function(z, activation) { switch(activation, - "identity" = z, "linear" = z, "none" = z, + "identity" = z, + "linear" = z, + "none" = z, "sigmoid" = 1 / (1 + exp(-z)), "tanh" = tanh(z), "relu" = pmax(0, z), @@ -66,7 +70,7 @@ bkprp_backpropagation <- function(x, y, w = NULL, b = NULL, ) } -#' @rdname bkprp_backpropagation +#' @rdname morie_bkprp_backpropagation #' @keywords internal #' @export -backpropagation <- bkprp_backpropagation +morie_backpropagation <- morie_bkprp_backpropagation diff --git a/r-package/morie/R/blasf.R b/r-package/morie/R/blasf.R index 67b08fefb5..4f4ac649ce 100644 --- a/r-package/morie/R/blasf.R +++ b/r-package/morie/R/blasf.R @@ -15,28 +15,36 @@ #' @return list(estimate, beta, intercept, se, beta_se, lam, sigma2, n_iter, n, p, method). #' @references Park & Casella (2008) JASA 103:681. Montesinos Lopez Ch 4. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_bayesian_lasso_full( +#' x = matrix(rnorm(150), 50, 3), y = rnorm(50), +#' n_iter = 50L, burn = 10L, lam = 1, seed = 1L, +#' deterministic_seed = TRUE +#' ) #' @export -bayesian_lasso_full <- function(x, y, n_iter = 200, burn = 50, - lam = NULL, seed = 0, - deterministic_seed = NULL) { +morie_bayesian_lasso_full <- function(x, y, n_iter = 200, burn = 50, + lam = NULL, seed = 0, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("blasf", deterministic_seed) } else { set.seed(seed) } - X <- as.matrix(x); y <- as.numeric(y); n <- nrow(X); p <- ncol(X) - ym <- mean(y); yc <- y - ym + X <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(X) + p <- ncol(X) + ym <- mean(y) + yc <- y - ym Xc <- sweep(X, 2, colMeans(X)) beta <- rep(0, p) sigma2 <- if (n > 1) stats::var(yc) else 1 tau2 <- rep(1, p) lam_val <- if (is.null(lam)) 1 else as.numeric(lam) - XtX <- crossprod(Xc); Xty <- crossprod(Xc, yc) - bsum <- matrix(0, 0, p); ssum <- numeric(0); lsum <- numeric(0) + XtX <- crossprod(Xc) + Xty <- crossprod(Xc, yc) + bsum <- matrix(0, 0, p) + ssum <- numeric(0) + lsum <- numeric(0) for (it in seq_len(n_iter)) { Dinv <- diag(1 / tau2) A <- XtX + Dinv @@ -52,7 +60,7 @@ bayesian_lasso_full <- function(x, y, n_iter = 200, burn = 50, u <- stats::rchisq(p, 1) y_ig <- mu_pr + (mu_pr^2 * u) / (2 * lam_pr) - (mu_pr / (2 * lam_pr)) * - sqrt(4 * mu_pr * lam_pr * u + mu_pr^2 * u^2) + sqrt(4 * mu_pr * lam_pr * u + mu_pr^2 * u^2) z2 <- stats::runif(p) x_ig <- ifelse(z2 <= mu_pr / (mu_pr + y_ig), y_ig, mu_pr^2 / y_ig) x_ig <- pmax(x_ig, 1e-8) @@ -68,18 +76,22 @@ bayesian_lasso_full <- function(x, y, n_iter = 200, burn = 50, lam_val <- sqrt(max(lam2, 1e-8)) } if (it > burn) { - bsum <- rbind(bsum, beta); ssum <- c(ssum, sigma2); lsum <- c(lsum, lam_val) + bsum <- rbind(bsum, beta) + ssum <- c(ssum, sigma2) + lsum <- c(lsum, lam_val) } } beta_hat <- colMeans(bsum) beta_se <- if (nrow(bsum) > 1) apply(bsum, 2, stats::sd) else rep(0, p) - list(estimate = mean(abs(beta_hat)), beta = beta_hat, intercept = ym, - se = mean(beta_se), beta_se = beta_se, - lam = mean(lsum), sigma2 = mean(ssum), - n_iter = length(ssum), n = n, p = p, - method = "Bayesian LASSO (Park-Casella short Gibbs)") + list( + estimate = mean(abs(beta_hat)), beta = beta_hat, intercept = ym, + se = mean(beta_se), beta_se = beta_se, + lam = mean(lsum), sigma2 = mean(ssum), + n_iter = length(ssum), n = n, p = p, + method = "Bayesian LASSO (Park-Casella short Gibbs)" + ) } # CANONICAL TEST # set.seed(3); X <- matrix(rnorm(100), 20, 5); b <- c(1,-1,0,0,0) -# y <- X %*% b + 0.2*rnorm(20); bayesian_lasso_full(X, y, seed=3)$beta +# y <- X %*% b + 0.2*rnorm(20); morie_bayesian_lasso_full(X, y, seed=3)$beta diff --git a/r-package/morie/R/bnfwd.R b/r-package/morie/R/bnfwd.R index 88c72ade8e..484d19d938 100644 --- a/r-package/morie/R/bnfwd.R +++ b/r-package/morie/R/bnfwd.R @@ -13,12 +13,9 @@ #' @return Named list \code{(y, estimate, x_hat, mu, var, eps, method)}. #' @references Ioffe & Szegedy (2015), ICML. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_bnfwd_batch_norm_forward(x = rnorm(50)) #' @export -bnfwd_batch_norm_forward <- function(x, gamma = NULL, beta = NULL, +morie_bnfwd_batch_norm_forward <- function(x, gamma = NULL, beta = NULL, eps = 1e-5) { x <- as.matrix(x) # axis=0 (batch) -> per-feature stats @@ -30,11 +27,13 @@ bnfwd_batch_norm_forward <- function(x, gamma = NULL, beta = NULL, x_hat <- sweep(x_hat, 2L, sqrt(var + eps), "/") y <- sweep(x_hat, 2L, gamma, "*") y <- sweep(y, 2L, beta, "+") - list(y = y, estimate = y, x_hat = x_hat, mu = mu, var = var, eps = eps, - method = "Batch normalization forward") + list( + y = y, estimate = y, x_hat = x_hat, mu = mu, var = var, eps = eps, + method = "Batch normalization forward" + ) } -#' @rdname bnfwd_batch_norm_forward +#' @rdname morie_bnfwd_batch_norm_forward #' @keywords internal #' @export -batch_norm_forward <- bnfwd_batch_norm_forward +morie_batch_norm_forward <- morie_bnfwd_batch_norm_forward diff --git a/r-package/morie/R/bpblm.R b/r-package/morie/R/bpblm.R index f176e89086..f32d53278e 100644 --- a/r-package/morie/R/bpblm.R +++ b/r-package/morie/R/bpblm.R @@ -9,9 +9,12 @@ bits_per_byte <- function(x, n_bytes = NULL) { nll <- as.numeric(x) if (!length(nll)) stop("Need at least one token NLL") - total <- sum(nll); nb <- if (is.null(n_bytes)) length(nll) else as.integer(n_bytes) + total <- sum(nll) + nb <- if (is.null(n_bytes)) length(nll) else as.integer(n_bytes) if (nb <= 0) stop("n_bytes must be > 0") - list(value = total / (nb * log(2)), - nll_nats = total, n_tokens = length(nll), n_bytes = nb, - method = "BPB") + list( + value = total / (nb * log(2)), + nll_nats = total, n_tokens = length(nll), n_bytes = nb, + method = "BPB" + ) } diff --git a/r-package/morie/R/brdgf.R b/r-package/morie/R/brdgf.R index b41af43f9f..9fa18d5de3 100644 --- a/r-package/morie/R/brdgf.R +++ b/r-package/morie/R/brdgf.R @@ -18,21 +18,22 @@ #' @return list(estimate, beta, beta_se, sigma_j2, sigma2, n_iter, n, p, method). #' @references Meuwissen-Hayes-Goddard (2001) Genetics 157:1819. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_bayes_ridge_gibbs(x = rnorm(50), y = rnorm(50)) #' @export -bayes_ridge_gibbs <- function(x, y, n_iter = 200, burn = 50, - df0 = 4, S0 = NULL, seed = 0, - deterministic_seed = NULL) { +morie_bayes_ridge_gibbs <- function(x, y, n_iter = 200, burn = 50, + df0 = 4, S0 = NULL, seed = 0, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("brdgf", deterministic_seed) } else { set.seed(seed) } - X <- as.matrix(x); y <- as.numeric(y); n <- nrow(X); p <- ncol(X) - ym <- mean(y); yc <- y - ym + X <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(X) + p <- ncol(X) + ym <- mean(y) + yc <- y - ym Xc <- sweep(X, 2, colMeans(X)) var_y <- if (n > 1) stats::var(yc) else 1 if (is.null(S0)) S0 <- max((var_y / max(p, 1)) * (df0 - 2) / df0, 1e-6) @@ -41,7 +42,9 @@ bayes_ridge_gibbs <- function(x, y, n_iter = 200, burn = 50, sigma2 <- var_y xtx_diag <- colSums(Xc^2) resid <- yc - as.numeric(Xc %*% beta) - bsum <- matrix(0, 0, p); sj_sum <- matrix(0, 0, p); ssum <- numeric(0) + bsum <- matrix(0, 0, p) + sj_sum <- matrix(0, 0, p) + ssum <- numeric(0) for (it in seq_len(n_iter)) { for (j in seq_len(p)) { xj <- Xc[, j] @@ -55,26 +58,30 @@ bayes_ridge_gibbs <- function(x, y, n_iter = 200, burn = 50, scale_post <- (S0 * df0 + beta^2) / df_post chi2 <- stats::rchisq(p, df_post) sigma_j2 <- pmax(scale_post * df_post / pmax(chi2, 1e-8), 1e-12) - df_e <- 4; Se <- var_y * (df_e - 2) / df_e + df_e <- 4 + Se <- var_y * (df_e - 2) / df_e df_post_e <- n + df_e scale_post_e <- (sum(resid^2) + df_e * Se) / df_post_e sigma2 <- max(scale_post_e * df_post_e / - max(stats::rchisq(1, df_post_e), 1e-8), 1e-12) + max(stats::rchisq(1, df_post_e), 1e-8), 1e-12) if (it > burn) { - bsum <- rbind(bsum, beta); sj_sum <- rbind(sj_sum, sigma_j2) + bsum <- rbind(bsum, beta) + sj_sum <- rbind(sj_sum, sigma_j2) ssum <- c(ssum, sigma2) } } beta_hat <- colMeans(bsum) beta_se <- if (nrow(bsum) > 1) apply(bsum, 2, stats::sd) else rep(0, p) sigma_j2_hat <- colMeans(sj_sum) - list(estimate = mean(abs(beta_hat)), beta = beta_hat, beta_se = beta_se, - se = mean(beta_se), sigma_j2 = sigma_j2_hat, - sigma2 = mean(ssum), intercept = ym, - n_iter = length(ssum), n = n, p = p, - method = "BayesA short Gibbs (Meuwissen-Hayes-Goddard)") + list( + estimate = mean(abs(beta_hat)), beta = beta_hat, beta_se = beta_se, + se = mean(beta_se), sigma_j2 = sigma_j2_hat, + sigma2 = mean(ssum), intercept = ym, + n_iter = length(ssum), n = n, p = p, + method = "BayesA short Gibbs (Meuwissen-Hayes-Goddard)" + ) } # CANONICAL TEST # set.seed(4); X <- matrix(rnorm(100), 20, 5); b <- c(1,-1,0.5,0,0) -# y <- X %*% b + 0.2*rnorm(20); bayes_ridge_gibbs(X, y, seed=4)$beta +# y <- X %*% b + 0.2*rnorm(20); morie_bayes_ridge_gibbs(X, y, seed=4)$beta diff --git a/r-package/morie/R/brdgr.R b/r-package/morie/R/brdgr.R index f94ca3e360..2d21442020 100644 --- a/r-package/morie/R/brdgr.R +++ b/r-package/morie/R/brdgr.R @@ -10,38 +10,44 @@ #' @return Named list with `n_bridges`, `bridge_ids`, `share`, `n1`, #' `n2`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' brdgr(x = rnorm(50)) #' @export brdgr <- function(x, y = NULL) { if (is.null(y)) { xb <- as.logical(x) - return(list(n_bridges = sum(xb), bridge_ids = which(xb), - share = sum(xb) / max(length(xb), 1L), - n1 = length(xb), n2 = length(xb), - method = "bridge_observations")) + return(list( + n_bridges = sum(xb), bridge_ids = which(xb), + share = sum(xb) / max(length(xb), 1L), + n1 = length(xb), n2 = length(xb), + method = "morie_bridge_observations" + )) } if (!is.matrix(x) && !is.matrix(y)) { - s1 <- unique(x); s2 <- unique(y); common <- sort(intersect(s1, s2)) - return(list(n_bridges = length(common), bridge_ids = common, - share = length(common) / max(length(s1), 1L), - n1 = length(s1), n2 = length(s2), - method = "bridge_observations")) + s1 <- unique(x) + s2 <- unique(y) + common <- sort(intersect(s1, s2)) + return(list( + n_bridges = length(common), bridge_ids = common, + share = length(common) / max(length(s1), 1L), + n1 = length(s1), n2 = length(s2), + method = "morie_bridge_observations" + )) } - if (!is.matrix(x) || !is.matrix(y) || nrow(x) != nrow(y)) + if (!is.matrix(x) || !is.matrix(y) || nrow(x) != nrow(y)) { stop("x and y must be matrices with matching n rows") + } has1 <- rowSums(!is.na(x)) > 0 has2 <- rowSums(!is.na(y)) > 0 bridges <- has1 & has2 - list(n_bridges = sum(bridges), bridge_ids = which(bridges), - share = sum(bridges) / max(nrow(x), 1L), - n1 = sum(has1), n2 = sum(has2), - method = "bridge_observations") + list( + n_bridges = sum(bridges), bridge_ids = which(bridges), + share = sum(bridges) / max(nrow(x), 1L), + n1 = sum(has1), n2 = sum(has2), + method = "morie_bridge_observations" + ) } #' @keywords internal #' @rdname brdgr #' @export -bridge_observations <- brdgr +morie_bridge_observations <- brdgr diff --git a/r-package/morie/R/brreg.R b/r-package/morie/R/brreg.R index 287572a52c..7e11a8e1cc 100644 --- a/r-package/morie/R/brreg.R +++ b/r-package/morie/R/brreg.R @@ -10,15 +10,17 @@ #' @return list(estimate, beta, intercept, se, beta_se, lam, n, p, method). #' @references Montesinos Lopez Ch 4. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_bayesian_ridge_regression(x = rnorm(50), y = rnorm(50)) #' @export -bayesian_ridge_regression <- function(x, y, lam = NULL) { - X <- as.matrix(x); y <- as.numeric(y); n <- nrow(X); p <- ncol(X) - ym <- mean(y); yc <- y - ym - xm <- colMeans(X); Xc <- sweep(X, 2, xm) +morie_bayesian_ridge_regression <- function(x, y, lam = NULL) { + X <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(X) + p <- ncol(X) + ym <- mean(y) + yc <- y - ym + xm <- colMeans(X) + Xc <- sweep(X, 2, xm) if (is.null(lam)) { var_y <- if (n > 1) stats::var(yc) else 1 h2 <- 0.5 @@ -34,11 +36,13 @@ bayesian_ridge_regression <- function(x, y, lam = NULL) { sigma2 <- sum(resid^2) / max(n - 1, 1) cov_beta <- sigma2 * solve(A) beta_se <- sqrt(pmax(diag(cov_beta), 0)) - list(estimate = mean(abs(beta)), beta = beta, intercept = ym, - se = mean(beta_se), beta_se = beta_se, lam = lam, - n = n, p = p, method = "Bayesian ridge (closed-form posterior mode)") + list( + estimate = mean(abs(beta)), beta = beta, intercept = ym, + se = mean(beta_se), beta_se = beta_se, lam = lam, + n = n, p = p, method = "Bayesian ridge (closed-form posterior mode)" + ) } # CANONICAL TEST # set.seed(2); X <- matrix(rnorm(100), 20, 5); b <- c(1,-1,0.5,0,0) -# y <- X %*% b + 0.1*rnorm(20); bayesian_ridge_regression(X, y)$beta +# y <- X %*% b + 0.1*rnorm(20); morie_bayesian_ridge_regression(X, y)$beta diff --git a/r-package/morie/R/btsrp.R b/r-package/morie/R/btsrp.R index 2f76b3ddfa..8b81800fe6 100644 --- a/r-package/morie/R/btsrp.R +++ b/r-package/morie/R/btsrp.R @@ -1,7 +1,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later #' Bootstrap confidence interval (percentile, BCa, studentized) #' -#' R parity of \code{morie.fn.btsrp.bootstrap_ci}. Three methods are +#' R parity of \code{morie.fn.btsrp.morie_bootstrap_ci}. Three methods are #' supported: percentile, BCa (Efron 1987 JASA), and studentized #' (Hall 1988 nested resampling). #' @@ -23,9 +23,11 @@ btsrp <- function(x, statistic = NULL, B = 2000L, alpha = 0.05, x <- as.numeric(x) n <- length(x) if (n < 2L) { - return(list(estimate = NA_real_, se = NA_real_, - ci_lower = NA_real_, ci_upper = NA_real_, - n = n, method = method)) + return(list( + estimate = NA_real_, se = NA_real_, + ci_lower = NA_real_, ci_upper = NA_real_, + n = n, method = method + )) } if (is.null(statistic)) statistic <- mean set.seed(seed) @@ -33,9 +35,12 @@ btsrp <- function(x, statistic = NULL, B = 2000L, alpha = 0.05, boot <- replicate(B, statistic(sample(x, n, replace = TRUE))) se <- stats::sd(boot) if (method == "percentile") { - ci <- stats::quantile(boot, probs = c(alpha / 2, 1 - alpha / 2), - names = FALSE) - lo <- ci[1]; hi <- ci[2] + ci <- stats::quantile(boot, + probs = c(alpha / 2, 1 - alpha / 2), + names = FALSE + ) + lo <- ci[1] + hi <- ci[2] } else if (method == "bca") { z0 <- stats::qnorm(mean(boot < theta_hat)) jk <- vapply(seq_len(n), function(i) statistic(x[-i]), numeric(1)) @@ -48,27 +53,35 @@ btsrp <- function(x, statistic = NULL, B = 2000L, alpha = 0.05, a1 <- stats::pnorm(z0 + (z0 + z_lo) / (1 - a * (z0 + z_lo))) a2 <- stats::pnorm(z0 + (z0 + z_hi) / (1 - a * (z0 + z_hi))) ci <- stats::quantile(boot, probs = c(a1, a2), names = FALSE) - lo <- ci[1]; hi <- ci[2] - } else { # studentized + lo <- ci[1] + hi <- ci[2] + } else { # studentized B2 <- max(50, B %/% 10) t_stars <- numeric(B) for (b in seq_len(B)) { idx <- sample.int(n, n, replace = TRUE) - xb <- x[idx]; theta_b <- statistic(xb) - inner <- replicate(B2, - statistic(sample(xb, n, replace = TRUE))) + xb <- x[idx] + theta_b <- statistic(xb) + inner <- replicate( + B2, + statistic(sample(xb, n, replace = TRUE)) + ) se_b <- stats::sd(inner) t_stars[b] <- if (se_b > 0) (theta_b - theta_hat) / se_b else 0 } - qs <- stats::quantile(t_stars, probs = c(alpha / 2, 1 - alpha / 2), - names = FALSE) + qs <- stats::quantile(t_stars, + probs = c(alpha / 2, 1 - alpha / 2), + names = FALSE + ) lo <- theta_hat - qs[2] * se hi <- theta_hat - qs[1] * se } - list(estimate = as.numeric(theta_hat), se = as.numeric(se), - ci_lower = as.numeric(lo), ci_upper = as.numeric(hi), - alpha = alpha, B = as.integer(B), n = as.integer(n), - method = paste0("Bootstrap CI (", method, ")")) + list( + estimate = as.numeric(theta_hat), se = as.numeric(se), + ci_lower = as.numeric(lo), ci_upper = as.numeric(hi), + alpha = alpha, B = as.integer(B), n = as.integer(n), + method = paste0("Bootstrap CI (", method, ")") + ) } # CANONICAL TEST @@ -80,4 +93,4 @@ btsrp <- function(x, statistic = NULL, B = 2000L, alpha = 0.05, #' @rdname btsrp #' @keywords internal #' @export -bootstrap_ci <- btsrp +morie_bootstrap_ci <- btsrp diff --git a/r-package/morie/R/bysid.R b/r-package/morie/R/bysid.R index 85f158355b..43ef71488a 100644 --- a/r-package/morie/R/bysid.R +++ b/r-package/morie/R/bysid.R @@ -16,10 +16,7 @@ #' @return Named list with `x_mean`, `x_sd`, `x_ci`, `alpha`, `beta`, #' `n_iter`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' bysid(x = rnorm(50)) #' @export bysid <- function(x, n_iter = 400L, burn = 100L, seed = 0L, deterministic_seed = NULL) { @@ -30,19 +27,25 @@ bysid <- function(x, n_iter = 400L, burn = 100L, seed = 0L, set.seed(seed) } M <- if (is.matrix(x)) x else matrix(as.numeric(x), ncol = 1L) - n <- nrow(M); m <- ncol(M) - if (n < 2L) - return(list(x_mean = rep(NA_real_, n), x_sd = rep(NA_real_, n), - x_ci = matrix(NA_real_, n, 2L), - alpha = rep(NA_real_, m), beta = rep(NA_real_, m), - n_iter = 0L, method = "bayesian_ideal_points")) + n <- nrow(M) + m <- ncol(M) + if (n < 2L) { + return(list( + x_mean = rep(NA_real_, n), x_sd = rep(NA_real_, n), + x_ci = matrix(NA_real_, n, 2L), + alpha = rep(NA_real_, m), beta = rep(NA_real_, m), + n_iter = 0L, method = "morie_bayesian_ideal_points" + )) + } Mc <- M - matrix(colMeans(M, na.rm = TRUE), n, m, byrow = TRUE) Mc[is.na(Mc)] <- 0 sv <- tryCatch(svd(Mc, nu = 1L, nv = 0L), error = function(e) NULL) x_cur <- if (!is.null(sv)) sv$u[, 1] * sv$d[1] else stats::rnorm(n) x_cur <- (x_cur - mean(x_cur)) / (stats::sd(x_cur) + 1e-12) - a_cur <- rep(1, m); b_cur <- rep(0, m) - step_x <- 0.4; step_ab <- 0.3 + a_cur <- rep(1, m) + b_cur <- rep(0, m) + step_x <- 0.4 + step_ab <- 0.3 loglik <- function(xv, av, bv) { Z <- sweep(outer(xv, bv, FUN = "-"), 2L, av, "*") P <- logistic(Z) @@ -50,20 +53,31 @@ bysid <- function(x, n_iter = 400L, burn = 100L, seed = 0L, sum(ifelse(mask, M * log(P + 1e-12) + (1 - M) * log(1 - P + 1e-12), 0)) } ll_cur <- loglik(x_cur, a_cur, b_cur) - samples <- list(); a_samples <- list(); b_samples <- list() + samples <- list() + a_samples <- list() + b_samples <- list() for (t in seq_len(n_iter)) { xp <- x_cur + step_x * stats::rnorm(n) llp <- loglik(xp, a_cur, b_cur) la <- (llp - 0.5 * sum(xp^2)) - (ll_cur - 0.5 * sum(x_cur^2)) - if (log(stats::runif(1)) < la) { x_cur <- xp; ll_cur <- llp } + if (log(stats::runif(1)) < la) { + x_cur <- xp + ll_cur <- llp + } ap <- a_cur + step_ab * stats::rnorm(m) llp <- loglik(x_cur, ap, b_cur) la <- (llp - 0.5 * sum(ap^2) / 25) - (ll_cur - 0.5 * sum(a_cur^2) / 25) - if (log(stats::runif(1)) < la) { a_cur <- ap; ll_cur <- llp } + if (log(stats::runif(1)) < la) { + a_cur <- ap + ll_cur <- llp + } bp <- b_cur + step_ab * stats::rnorm(m) llp <- loglik(x_cur, a_cur, bp) la <- (llp - 0.5 * sum(bp^2) / 25) - (ll_cur - 0.5 * sum(b_cur^2) / 25) - if (log(stats::runif(1)) < la) { b_cur <- bp; ll_cur <- llp } + if (log(stats::runif(1)) < la) { + b_cur <- bp + ll_cur <- llp + } if (t > burn) { xs <- (x_cur - mean(x_cur)) / (stats::sd(x_cur) + 1e-12) samples[[length(samples) + 1L]] <- xs @@ -72,23 +86,30 @@ bysid <- function(x, n_iter = 400L, burn = 100L, seed = 0L, } } if (length(samples) == 0L) { - return(list(x_mean = rep(NA_real_, n), x_sd = rep(NA_real_, n), - x_ci = matrix(NA_real_, n, 2L), - alpha = rep(NA_real_, m), beta = rep(NA_real_, m), - n_iter = n_iter, method = "bayesian_ideal_points")) + return(list( + x_mean = rep(NA_real_, n), x_sd = rep(NA_real_, n), + x_ci = matrix(NA_real_, n, 2L), + alpha = rep(NA_real_, m), beta = rep(NA_real_, m), + n_iter = n_iter, method = "morie_bayesian_ideal_points" + )) } arr <- do.call(rbind, samples) - x_mean <- colMeans(arr); x_sd <- apply(arr, 2L, stats::sd) - x_ci <- t(apply(arr, 2L, - function(z) stats::quantile(z, c(0.025, 0.975)))) + x_mean <- colMeans(arr) + x_sd <- apply(arr, 2L, stats::sd) + x_ci <- t(apply( + arr, 2L, + function(z) stats::quantile(z, c(0.025, 0.975)) + )) a_mean <- colMeans(do.call(rbind, a_samples)) b_mean <- colMeans(do.call(rbind, b_samples)) - list(x_mean = x_mean, x_sd = x_sd, x_ci = x_ci, - alpha = a_mean, beta = b_mean, - n_iter = n_iter, method = "bayesian_ideal_points") + list( + x_mean = x_mean, x_sd = x_sd, x_ci = x_ci, + alpha = a_mean, beta = b_mean, + n_iter = n_iter, method = "morie_bayesian_ideal_points" + ) } #' @keywords internal #' @rdname bysid #' @export -bayesian_ideal_points <- bysid +morie_bayesian_ideal_points <- bysid diff --git a/r-package/morie/R/causal.R b/r-package/morie/R/causal.R index 68cda66ef9..281bdd36d2 100644 --- a/r-package/morie/R/causal.R +++ b/r-package/morie/R/causal.R @@ -49,9 +49,9 @@ NULL #' @return Numeric vector of propensity scores (same length as `nrow(data)`). #' @export #' @examples -#' df <- data.frame(t = c(0,1,0,1,0,1), x = rnorm(6)) -#' ps <- estimate_propensity_scores(df, "t", "x") -estimate_propensity_scores <- function(data, treatment, covariates, +#' df <- data.frame(t = c(0, 1, 0, 1, 0, 1), x = rnorm(6)) +#' ps <- morie_estimate_propensity_scores(df, "t", "x") +morie_estimate_propensity_scores <- function(data, treatment, covariates, trim = c(0.01, 0.99)) { ps <- .fit_propensity(data, treatment, covariates) lo <- stats::quantile(ps, trim[1]) @@ -86,15 +86,15 @@ estimate_propensity_scores <- function(data, treatment, covariates, #' y = rnorm(200), #' x = rnorm(200) #' ) -#' estimate_ate(df, "t", "y", "x") -estimate_ate <- function(data, treatment, outcome, covariates, +#' morie_estimate_ate(df, "t", "y", "x") +morie_estimate_ate <- function(data, treatment, outcome, covariates, propensity_col = NULL) { t <- as.numeric(data[[treatment]]) y <- as.numeric(data[[outcome]]) ps <- if (!is.null(propensity_col)) { .clip_ps(data[[propensity_col]]) } else { - estimate_propensity_scores(data, treatment, covariates) + morie_estimate_propensity_scores(data, treatment, covariates) } w <- t / ps + (1 - t) / (1 - ps) @@ -103,8 +103,10 @@ estimate_ate <- function(data, treatment, outcome, covariates, ci <- .wald_ci(ate, se) ess <- (sum(w)^2) / sum(w^2) - list(ate = ate, se = se, ci_lower = ci[1], ci_upper = ci[2], - n = length(y), ess = ess) + list( + ate = ate, se = se, ci_lower = ci[1], ci_upper = ci[2], + n = length(y), ess = ess + ) } @@ -117,21 +119,21 @@ estimate_ate <- function(data, treatment, outcome, covariates, #' Treated units receive weight 1; controls receive #' \eqn{w_i = \hat{e}(X_i)/(1-\hat{e}(X_i))}. #' -#' @inheritParams estimate_ate +#' @inheritParams morie_estimate_ate #' @return Named list: `att`, `se`, `ci_lower`, `ci_upper`, `n_treated`. #' @export #' @examples #' set.seed(2) #' df <- data.frame(t = rbinom(200, 1, 0.4), y = rnorm(200), x = rnorm(200)) -#' estimate_att(df, "t", "y", "x") -estimate_att <- function(data, treatment, outcome, covariates, +#' morie_estimate_att(df, "t", "y", "x") +morie_estimate_att <- function(data, treatment, outcome, covariates, propensity_col = NULL) { t <- as.numeric(data[[treatment]]) y <- as.numeric(data[[outcome]]) ps <- if (!is.null(propensity_col)) { .clip_ps(data[[propensity_col]]) } else { - estimate_propensity_scores(data, treatment, covariates) + morie_estimate_propensity_scores(data, treatment, covariates) } # Control weights: e(X) / (1 - e(X)) @@ -143,7 +145,7 @@ estimate_att <- function(data, treatment, outcome, covariates, n1 <- sum(t == 1) # Delta-method SE approximation se <- sqrt(stats::var(y[t == 1]) / n1 + - stats::var(w_ctrl[t == 0] * y[t == 0]) / sum(t == 0)) + stats::var(w_ctrl[t == 0] * y[t == 0]) / sum(t == 0)) ci <- .wald_ci(att, se) list(att = att, se = se, ci_lower = ci[1], ci_upper = ci[2], n_treated = n1) @@ -159,17 +161,21 @@ estimate_att <- function(data, treatment, outcome, covariates, #' Control units receive weight 1; treated units receive #' \eqn{w_i = (1-\hat{e}(X_i))/\hat{e}(X_i)}. #' -#' @inheritParams estimate_ate +#' @inheritParams morie_estimate_ate #' @return Named list: `atc`, `se`, `ci_lower`, `ci_upper`, `n_control`. +#' @examples +#' set.seed(1) +#' df <- data.frame(t = rbinom(200, 1, 0.4), y = rnorm(200), x = rnorm(200)) +#' morie_estimate_atc(df, "t", "y", "x") #' @export -estimate_atc <- function(data, treatment, outcome, covariates, +morie_estimate_atc <- function(data, treatment, outcome, covariates, propensity_col = NULL) { t <- as.numeric(data[[treatment]]) y <- as.numeric(data[[outcome]]) ps <- if (!is.null(propensity_col)) { .clip_ps(data[[propensity_col]]) } else { - estimate_propensity_scores(data, treatment, covariates) + morie_estimate_propensity_scores(data, treatment, covariates) } w_trt <- (1 - ps) / ps @@ -179,7 +185,7 @@ estimate_atc <- function(data, treatment, outcome, covariates, n0 <- sum(t == 0) se <- sqrt(stats::var(y[t == 0]) / n0 + - stats::var(w_trt[t == 1] * y[t == 1]) / sum(t == 1)) + stats::var(w_trt[t == 1] * y[t == 1]) / sum(t == 1)) ci <- .wald_ci(atc, se) list(atc = atc, se = se, ci_lower = ci[1], ci_upper = ci[2], n_control = n0) @@ -196,11 +202,15 @@ estimate_atc <- function(data, treatment, outcome, covariates, #' **either** the propensity model **or** the outcome model is correctly #' specified. #' -#' @inheritParams estimate_ate +#' @inheritParams morie_estimate_ate #' @param outcome_model Family for the outcome model: `"linear"` or `"logistic"`. #' @return Named list: `ate`, `se`, `ci_lower`, `ci_upper`, `n`. +#' @examples +#' set.seed(1) +#' df <- data.frame(t = rbinom(200, 1, 0.4), y = rnorm(200), x = rnorm(200)) +#' morie_estimate_aipw(df, "t", "y", "x") #' @export -estimate_aipw <- function(data, treatment, outcome, covariates, +morie_estimate_aipw <- function(data, treatment, outcome, covariates, propensity_col = NULL, outcome_model = c("linear", "logistic")) { outcome_model <- match.arg(outcome_model) @@ -209,7 +219,7 @@ estimate_aipw <- function(data, treatment, outcome, covariates, ps <- if (!is.null(propensity_col)) { .clip_ps(data[[propensity_col]]) } else { - estimate_propensity_scores(data, treatment, covariates) + morie_estimate_propensity_scores(data, treatment, covariates) } fam <- if (outcome_model == "logistic") stats::binomial() else stats::gaussian() @@ -217,8 +227,10 @@ estimate_aipw <- function(data, treatment, outcome, covariates, paste(outcome, "~", paste(c(treatment, covariates), collapse = " + ")) ) fit <- stats::glm(formula, data = data, family = fam) - data1 <- data; data1[[treatment]] <- 1 - data0 <- data; data0[[treatment]] <- 0 + data1 <- data + data1[[treatment]] <- 1 + data0 <- data + data0[[treatment]] <- 0 mu1 <- as.numeric(stats::predict(fit, newdata = data1, type = "response")) mu0 <- as.numeric(stats::predict(fit, newdata = data0, type = "response")) @@ -240,7 +252,7 @@ estimate_aipw <- function(data, treatment, outcome, covariates, #' Applies AIPW within each level of `group_col` to estimate #' stratum-specific treatment effects. #' -#' @inheritParams estimate_aipw +#' @inheritParams morie_estimate_aipw #' @param group_col Name of the grouping variable (e.g. `"gender"`). #' @return Data frame with columns: `group`, `ate`, `se`, #' `ci_lower`, `ci_upper`, `n`. @@ -253,8 +265,8 @@ estimate_aipw <- function(data, treatment, outcome, covariates, #' x = rnorm(300), #' g = sample(c("A", "B"), 300, replace = TRUE) #' ) -#' estimate_gate(df, "t", "y", "x", "g") -estimate_gate <- function(data, treatment, outcome, covariates, +#' morie_estimate_gate(df, "t", "y", "x", "g") +morie_estimate_gate <- function(data, treatment, outcome, covariates, group_col, propensity_col = NULL, outcome_model = c("linear", "logistic")) { outcome_model <- match.arg(outcome_model) @@ -272,11 +284,16 @@ estimate_gate <- function(data, treatment, outcome, covariates, next } est <- tryCatch( - estimate_aipw(sub, treatment, outcome, covariates, - propensity_col = propensity_col, - outcome_model = outcome_model), - error = function(e) list(ate = NA_real_, se = NA_real_, - ci_lower = NA_real_, ci_upper = NA_real_) + morie_estimate_aipw(sub, treatment, outcome, covariates, + propensity_col = propensity_col, + outcome_model = outcome_model + ), + error = function(e) { + list( + ate = NA_real_, se = NA_real_, + ci_lower = NA_real_, ci_upper = NA_real_ + ) + } ) results[[i]] <- data.frame( group = g, ate = est$ate, se = est$se, @@ -299,21 +316,25 @@ estimate_gate <- function(data, treatment, outcome, covariates, #' #' The **S-learner** fits one model with treatment as a feature. #' -#' @inheritParams estimate_aipw +#' @inheritParams morie_estimate_aipw #' @param meta_learner `"t_learner"` (default) or `"s_learner"`. #' @return Numeric vector of per-unit CATE estimates. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_estimate_cate( +#' data = data.frame( +#' t = stats::rbinom(100, 1, 0.4), +#' y = stats::rbinom(100, 1, 0.3), x1 = stats::rnorm(100), +#' x2 = stats::rnorm(100) +#' ), treatment = "t", outcome = "y", +#' covariates = c("x1", "x2") +#' ) #' @export -estimate_cate <- function(data, treatment, outcome, covariates, +morie_estimate_cate <- function(data, treatment, outcome, covariates, propensity_col = NULL, outcome_model = c("linear", "logistic"), meta_learner = c("t_learner", "s_learner")) { outcome_model <- match.arg(outcome_model) - meta_learner <- match.arg(meta_learner) + meta_learner <- match.arg(meta_learner) fam <- if (outcome_model == "logistic") stats::binomial() else stats::gaussian() t <- as.numeric(data[[treatment]]) @@ -330,8 +351,10 @@ estimate_cate <- function(data, treatment, outcome, covariates, paste(outcome, "~", paste(c(treatment, covariates), collapse = " + ")) ) fit <- stats::glm(formula_s, data = data, family = fam) - data1 <- data; data1[[treatment]] <- 1 - data0 <- data; data0[[treatment]] <- 0 + data1 <- data + data1[[treatment]] <- 1 + data0 <- data + data0[[treatment]] <- 0 mu1 <- as.numeric(stats::predict(fit, newdata = data1, type = "response")) mu0 <- as.numeric(stats::predict(fit, newdata = data0, type = "response")) } @@ -364,7 +387,14 @@ estimate_cate <- function(data, treatment, outcome, covariates, #' @references #' Imbens GW, Angrist JD (1994). Identification and estimation of local #' average treatment effects. *Econometrica*, 62(2), 467-475. -estimate_late <- function(data, treatment, outcome, instrument, +#' @examples +#' set.seed(1) +#' n <- 300L +#' z <- rbinom(n, 1, 0.5) +#' t <- rbinom(n, 1, plogis(-0.2 + 1.5 * z)) +#' y <- 0.8 * t + rnorm(n) +#' morie_estimate_late(data.frame(t = t, y = y, z = z), "t", "y", "z") +morie_estimate_late <- function(data, treatment, outcome, instrument, covariates = NULL) { t <- as.numeric(data[[treatment]]) y <- as.numeric(data[[outcome]]) @@ -372,8 +402,10 @@ estimate_late <- function(data, treatment, outcome, instrument, # First-stage F statistic (strength of instrument) fs_formula <- stats::as.formula( - paste(treatment, "~", instrument, - if (!is.null(covariates)) paste("+", paste(covariates, collapse = " + ")) else "") + paste( + treatment, "~", instrument, + if (!is.null(covariates)) paste("+", paste(covariates, collapse = " + ")) else "" + ) ) fs_fit <- stats::lm(fs_formula, data = data) fs_f <- summary(fs_fit)$fstatistic[1] @@ -397,10 +429,11 @@ estimate_late <- function(data, treatment, outcome, instrument, ) if (requireNamespace("ivreg", quietly = TRUE)) { fit_iv <- ivreg::ivreg( - stats::as.formula(iv_formula_str), data = data + stats::as.formula(iv_formula_str), + data = data ) late <- stats::coef(fit_iv)[treatment] - se <- sqrt(stats::vcov(fit_iv)[treatment, treatment]) + se <- sqrt(stats::vcov(fit_iv)[treatment, treatment]) } else { # Fallback: manual 2SLS t_hat <- stats::fitted(fs_fit) @@ -408,17 +441,22 @@ estimate_late <- function(data, treatment, outcome, instrument, data2[[paste0(treatment, "_hat")]] <- t_hat rhs2 <- paste(c(paste0(treatment, "_hat"), covariates), collapse = " + ") ss_fit <- stats::lm( - stats::as.formula(paste(outcome, "~", rhs2)), data = data2 + stats::as.formula(paste(outcome, "~", rhs2)), + data = data2 ) late <- stats::coef(ss_fit)[paste0(treatment, "_hat")] - se <- sqrt(stats::vcov(ss_fit)[paste0(treatment, "_hat"), - paste0(treatment, "_hat")]) + se <- sqrt(stats::vcov(ss_fit)[ + paste0(treatment, "_hat"), + paste0(treatment, "_hat") + ]) } } ci <- .wald_ci(late, se) - list(late = late, se = se, ci_lower = ci[1], ci_upper = ci[2], - first_stage_f = as.numeric(fs_f), n = length(y)) + list( + late = late, se = se, ci_lower = ci[1], ci_upper = ci[2], + first_stage_f = as.numeric(fs_f), n = length(y) + ) } @@ -436,19 +474,19 @@ estimate_late <- function(data, treatment, outcome, instrument, #' #' @param rr Risk ratio estimate (> 0). Supply > 1; if < 1, pass its reciprocal. #' @param rr_lower Lower bound of the 95% CI (used to compute E-value for CI). -#' @return Named list: `e_value`, `e_value_ci` (for the CI bound). +#' @return Named list: `morie_e_value`, `e_value_ci` (for the CI bound). #' @export #' @references #' VanderWeele TJ, Ding P (2017). Sensitivity analysis in observational #' research: introducing the E-value. *Annals of Internal Medicine*, #' 167(4):268-274. #' @examples -#' e_value(rr = 3.9, rr_lower = 2.4) -e_value <- function(rr, rr_lower = NULL) { +#' morie_e_value(rr = 3.9, rr_lower = 2.4) +morie_e_value <- function(rr, rr_lower = NULL) { compute_e <- function(r) r + sqrt(r * (r - 1)) ev <- compute_e(rr) ev_ci <- if (!is.null(rr_lower)) compute_e(rr_lower) else NA_real_ - list(e_value = ev, e_value_ci = ev_ci) + list(morie_e_value = ev, e_value_ci = ev_ci) } @@ -471,14 +509,11 @@ e_value <- function(rr, rr_lower = NULL) { #' @param gamma_range Numeric vector of \eqn{\Gamma} values to test. #' @return Data frame with columns: `gamma`, `p_lower`, `p_upper`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_sensitivity_rosenbaum(treated = rnorm(30, 0.5), control = rnorm(30)) #' @export #' @references #' Rosenbaum PR (2002). *Observational Studies* (2nd ed.). Springer. -sensitivity_rosenbaum <- function(treated, control, +morie_sensitivity_rosenbaum <- function(treated, control, gamma_range = seq(1, 3, by = 0.2)) { n1 <- length(treated) n0 <- length(control) @@ -491,12 +526,12 @@ sensitivity_rosenbaum <- function(treated, control, n_pairs <- n1 * n0 # Upper bound: p-value under maximum assignment probability - p_plus <- gamma / (1 + gamma) + p_plus <- gamma / (1 + gamma) p_minus <- 1 / (1 + gamma) # Expected value and variance under gamma - E_upper <- sum(p_plus * (signs > 0) + p_minus * (signs < 0)) - E_lower <- sum(p_minus * (signs > 0) + p_plus * (signs < 0)) + E_upper <- sum(p_plus * (signs > 0) + p_minus * (signs < 0)) + E_lower <- sum(p_minus * (signs > 0) + p_plus * (signs < 0)) V <- n_pairs * p_plus * p_minus T_stat <- sum(signs > 0) @@ -519,10 +554,14 @@ sensitivity_rosenbaum <- function(treated, control, #' Estimates the ATE by: #' \deqn{\widehat{ATE} = \frac{1}{n}\sum_i \bigl[\hat{\mu}_1(X_i) - \hat{\mu}_0(X_i)\bigr]} #' -#' @inheritParams estimate_aipw +#' @inheritParams morie_estimate_aipw #' @return Named list: `ate`, `se`, `ci_lower`, `ci_upper`. +#' @examples +#' set.seed(1) +#' df <- data.frame(t = rbinom(200, 1, 0.4), y = rnorm(200), x = rnorm(200)) +#' morie_estimate_g_computation(df, "t", "y", "x") #' @export -estimate_g_computation <- function(data, treatment, outcome, covariates, +morie_estimate_g_computation <- function(data, treatment, outcome, covariates, outcome_model = c("linear", "logistic")) { outcome_model <- match.arg(outcome_model) fam <- if (outcome_model == "logistic") stats::binomial() else stats::gaussian() @@ -530,8 +569,10 @@ estimate_g_computation <- function(data, treatment, outcome, covariates, paste(outcome, "~", paste(c(treatment, covariates), collapse = " + ")) ) fit <- stats::glm(formula, data = data, family = fam) - data1 <- data; data1[[treatment]] <- 1 - data0 <- data; data0[[treatment]] <- 0 + data1 <- data + data1[[treatment]] <- 1 + data0 <- data + data0[[treatment]] <- 0 mu1 <- as.numeric(stats::predict(fit, newdata = data1, type = "response")) mu0 <- as.numeric(stats::predict(fit, newdata = data0, type = "response")) diffs <- mu1 - mu0 diff --git a/r-package/morie/R/cncrd.R b/r-package/morie/R/cncrd.R index cd497afa96..cd1375fd23 100644 --- a/r-package/morie/R/cncrd.R +++ b/r-package/morie/R/cncrd.R @@ -11,19 +11,19 @@ #' @return Named list: statistic (W), p_value, df, chi2, n, k. #' @importFrom stats pchisq #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_concordance_incomplete(x = rnorm(50)) #' @export -concordance_incomplete <- function(x) { +morie_concordance_incomplete <- function(x) { X <- as.matrix(x) storage.mode(X) <- "numeric" - n <- nrow(X); k <- ncol(X) + n <- nrow(X) + k <- ncol(X) if (n < 2 || k < 2) { - return(list(statistic = NA_real_, p_value = NA_real_, df = n - 1L, - chi2 = NA_real_, n = n, k = k, - method = "Kendall's coefficient of concordance W")) + return(list( + statistic = NA_real_, p_value = NA_real_, df = n - 1L, + chi2 = NA_real_, n = n, k = k, + method = "Kendall's coefficient of concordance W" + )) } R <- matrix(NA_real_, n, k) for (j in seq_len(k)) { @@ -39,7 +39,8 @@ concordance_incomplete <- function(x) { W <- 12 * S / (k^2 * (n^3 - n)) } else { expected <- (n + 1) / 2 - S <- 0; norm <- 0 + S <- 0 + norm <- 0 for (i in seq_len(n)) { ri <- R[i, !is.na(R[i, ])] if (length(ri) > 0) { diff --git a/r-package/morie/R/cndrc.R b/r-package/morie/R/cndrc.R index d506389445..0e87f411d7 100644 --- a/r-package/morie/R/cndrc.R +++ b/r-package/morie/R/cndrc.R @@ -11,26 +11,32 @@ #' @return Named list with `winner` (1-based, or -1), `n_candidates`, #' `has_winner`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' cndrc(preference_matrix = matrix(rnorm(25), 5, 5)) #' @export cndrc <- function(preference_matrix) { M <- as.matrix(preference_matrix) - n <- nrow(M); winner <- -1L + n <- nrow(M) + winner <- -1L for (i in seq_len(n)) { beats_all <- TRUE for (j in seq_len(n)) { - if (i != j && M[i, j] <= M[j, i]) { beats_all <- FALSE; break } + if (i != j && M[i, j] <= M[j, i]) { + beats_all <- FALSE + break + } + } + if (beats_all) { + winner <- i + break } - if (beats_all) { winner <- i; break } } - list(winner = winner, n_candidates = n, - has_winner = winner > 0L, method = "condorcet_winner") + list( + winner = winner, n_candidates = n, + has_winner = winner > 0L, method = "morie_condorcet_winner" + ) } #' @keywords internal #' @rdname cndrc #' @export -condorcet_winner <- cndrc +morie_condorcet_winner <- cndrc diff --git a/r-package/morie/R/cnn1d.R b/r-package/morie/R/cnn1d.R index bb78c14e63..1be924bd4b 100644 --- a/r-package/morie/R/cnn1d.R +++ b/r-package/morie/R/cnn1d.R @@ -14,15 +14,14 @@ #' @return Named list \code{(y, estimate, output_length, method)}. #' @references Goodfellow et al. (2016), Deep Learning, Ch 9. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_cnn1d_conv1d_forward(x = rnorm(50), w = rnorm(3)) #' @export -cnn1d_conv1d_forward <- function(x, w, b = 0, stride = 1L, padding = 0L) { - x <- as.numeric(x); w <- as.numeric(w) +morie_cnn1d_conv1d_forward <- function(x, w, b = 0, stride = 1L, padding = 0L) { + x <- as.numeric(x) + w <- as.numeric(w) if (padding > 0) x <- c(rep(0, padding), x, rep(0, padding)) - K <- length(w); N <- length(x) + K <- length(w) + N <- length(x) if (N < K) stop(sprintf("Input length %d < kernel length %d", N, K)) out_len <- (N - K) %/% stride + 1L y <- numeric(out_len) @@ -30,11 +29,13 @@ cnn1d_conv1d_forward <- function(x, w, b = 0, stride = 1L, padding = 0L) { i0 <- (j - 1L) * stride + 1L y[j] <- sum(w * x[i0:(i0 + K - 1L)]) + b } - list(y = y, estimate = y, output_length = out_len, - method = "Conv1D forward (cross-correlation)") + list( + y = y, estimate = y, output_length = out_len, + method = "Conv1D forward (cross-correlation)" + ) } -#' @rdname cnn1d_conv1d_forward +#' @rdname morie_cnn1d_conv1d_forward #' @keywords internal #' @export -conv1d_forward <- cnn1d_conv1d_forward +morie_conv1d_forward <- morie_cnn1d_conv1d_forward diff --git a/r-package/morie/R/cnn2d.R b/r-package/morie/R/cnn2d.R index b5fe9cb050..5110b27d9f 100644 --- a/r-package/morie/R/cnn2d.R +++ b/r-package/morie/R/cnn2d.R @@ -14,21 +14,23 @@ #' @return Named list \code{(y, estimate, output_shape, method)}. #' @references Goodfellow et al. (2016), Deep Learning, Ch 9. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_cnn2d_conv2d_forward(x = rnorm(50), w = rnorm(3)) #' @export -cnn2d_conv2d_forward <- function(x, w, b = 0, stride = 1L, padding = 0L) { - x <- as.matrix(x); w <- as.matrix(w) +morie_cnn2d_conv2d_forward <- function(x, w, b = 0, stride = 1L, padding = 0L) { + x <- as.matrix(x) + w <- as.matrix(w) if (padding > 0) { pad <- padding x <- rbind(matrix(0, pad, ncol(x)), x, matrix(0, pad, ncol(x))) x <- cbind(matrix(0, nrow(x), pad), x, matrix(0, nrow(x), pad)) } - H <- nrow(x); W <- ncol(x); Kh <- nrow(w); Kw <- ncol(w) - if (H < Kh || W < Kw) + H <- nrow(x) + W <- ncol(x) + Kh <- nrow(w) + Kw <- ncol(w) + if (H < Kh || W < Kw) { stop(sprintf("Input (%d,%d) smaller than kernel (%d,%d)", H, W, Kh, Kw)) + } out_h <- (H - Kh) %/% stride + 1L out_w <- (W - Kw) %/% stride + 1L y <- matrix(0, out_h, out_w) @@ -39,12 +41,14 @@ cnn2d_conv2d_forward <- function(x, w, b = 0, stride = 1L, padding = 0L) { y[i, j] <- sum(w * x[i0:(i0 + Kh - 1L), j0:(j0 + Kw - 1L)]) + b } } - list(y = y, estimate = y, - output_shape = c(out_h, out_w), - method = "Conv2D forward (cross-correlation)") + list( + y = y, estimate = y, + output_shape = c(out_h, out_w), + method = "Conv2D forward (cross-correlation)" + ) } -#' @rdname cnn2d_conv2d_forward +#' @rdname morie_cnn2d_conv2d_forward #' @keywords internal #' @export -conv2d_forward <- cnn2d_conv2d_forward +morie_conv2d_forward <- morie_cnn2d_conv2d_forward diff --git a/r-package/morie/R/cnnge.R b/r-package/morie/R/cnnge.R index 066766eb53..6b8e33cd6e 100644 --- a/r-package/morie/R/cnnge.R +++ b/r-package/morie/R/cnnge.R @@ -13,30 +13,35 @@ #' @return list(estimate, y_hat, W_conv, b_conv, W1, b1, w2, b2, se, n, method). #' @references Montesinos Lopez Ch 13. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_cnn_genomic(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) #' @export -cnn_genomic <- function(x, y, markers, n_filters = 8, kernel = 3, - hidden = 8, n_epochs = 150, lr = 1e-2, - l2 = 1e-3, seed = 0, - deterministic_seed = NULL) { +morie_cnn_genomic <- function(x, y, markers, n_filters = 8, kernel = 3, + hidden = 8, n_epochs = 150, lr = 1e-2, + l2 = 1e-3, seed = 0, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("cnnge", deterministic_seed) } else { set.seed(seed) } - y <- as.numeric(y); n <- length(y) - M <- as.matrix(markers); m <- ncol(M) + y <- as.numeric(y) + n <- length(y) + M <- as.matrix(markers) + m <- ncol(M) if (kernel > m) kernel <- max(1, m) - M_mu <- colMeans(M); M_sd <- apply(M, 2, stats::sd); M_sd[M_sd == 0] <- 1 + M_mu <- colMeans(M) + M_sd <- apply(M, 2, stats::sd) + M_sd[M_sd == 0] <- 1 Ms <- sweep(sweep(M, 2, M_mu), 2, M_sd, "/") - Wc <- matrix(stats::rnorm(kernel * n_filters, 0, 1 / sqrt(kernel)), - kernel, n_filters) + Wc <- matrix( + stats::rnorm(kernel * n_filters, 0, 1 / sqrt(kernel)), + kernel, n_filters + ) bc <- rep(0, n_filters) - W1 <- matrix(stats::rnorm(n_filters * hidden, 0, 1 / sqrt(n_filters)), - n_filters, hidden) + W1 <- matrix( + stats::rnorm(n_filters * hidden, 0, 1 / sqrt(n_filters)), + n_filters, hidden + ) b1 <- rep(0, hidden) w2 <- stats::rnorm(hidden, 0, 1 / sqrt(hidden)) b2 <- mean(y) @@ -60,9 +65,12 @@ cnn_genomic <- function(x, y, markers, n_filters = 8, kernel = 3, y_hat <- as.numeric(h %*% w2) + b2 resid <- y_hat - y dy <- resid / n - dw2 <- as.numeric(crossprod(h, dy)) + l2 * w2; db2 <- sum(dy) - dh <- outer(dy, w2); dh_pre <- dh * (1 - h^2) - dW1 <- crossprod(p_mat, dh_pre) + l2 * W1; db1 <- colSums(dh_pre) + dw2 <- as.numeric(crossprod(h, dy)) + l2 * w2 + db2 <- sum(dy) + dh <- outer(dy, w2) + dh_pre <- dh * (1 - h^2) + dW1 <- crossprod(p_mat, dh_pre) + l2 * W1 + db1 <- colSums(dh_pre) dp <- dh_pre %*% t(W1) da <- array(0, dim = c(n, L, n_filters)) for (s in seq_len(L)) da[, s, ] <- dp / L @@ -74,21 +82,28 @@ cnn_genomic <- function(x, y, markers, n_filters = 8, kernel = 3, dWc <- dWc + crossprod(seg, dz[, s, ]) } dWc <- dWc + l2 * Wc - Wc <- Wc - lr * dWc; bc <- bc - lr * dbc - W1 <- W1 - lr * dW1; b1 <- b1 - lr * db1 - w2 <- w2 - lr * dw2; b2 <- b2 - lr * db2 + Wc <- Wc - lr * dWc + bc <- bc - lr * dbc + W1 <- W1 - lr * dW1 + b1 <- b1 - lr * db1 + w2 <- w2 - lr * dw2 + b2 <- b2 - lr * db2 losses[ep] <- mean(resid^2) } - z <- conv(Ms); a <- pmax(z, 0); p_mat <- apply(a, c(1, 3), mean) + z <- conv(Ms) + a <- pmax(z, 0) + p_mat <- apply(a, c(1, 3), mean) h <- tanh(sweep(p_mat %*% W1, 2, b1, "+")) y_hat <- as.numeric(h %*% w2) + b2 resid <- y - y_hat - list(estimate = mean(y_hat), y_hat = y_hat, - W_conv = Wc, b_conv = bc, W1 = W1, b1 = b1, w2 = w2, b2 = b2, - loss_curve = losses, se = sqrt(mean(resid^2)), - n = n, method = "Conv1D + GAP + dense (base R)") + list( + estimate = mean(y_hat), y_hat = y_hat, + W_conv = Wc, b_conv = bc, W1 = W1, b1 = b1, w2 = w2, b2 = b2, + loss_curve = losses, se = sqrt(mean(resid^2)), + n = n, method = "Conv1D + GAP + dense (base R)" + ) } # CANONICAL TEST # set.seed(7); M <- matrix(rnorm(160), 20, 8); y <- M[,2]+M[,4]+0.2*rnorm(20) -# cnn_genomic(rep(0,20), y, M, n_epochs=20, seed=7) +# morie_cnn_genomic(rep(0,20), y, M, n_epochs=20, seed=7) diff --git a/r-package/morie/R/cntgc.R b/r-package/morie/R/cntgc.R index 6de967280f..2f5e2db072 100644 --- a/r-package/morie/R/cntgc.R +++ b/r-package/morie/R/cntgc.R @@ -6,33 +6,34 @@ #' maximum attainable C = sqrt((min(r,c)-1)/min(r,c)). #' #' @param x A 2-D contingency table of counts. -#' @return Named list: statistic (C), cramers_v, chi2, p_value, df, +#' @return Named list: statistic (C), morie_cramers_v, chi2, p_value, df, #' max_C, n. #' @importFrom stats chisq.test #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_contingency_coefficient(x = matrix(sample(1:5, 50, TRUE), 10, 5)) #' @export -contingency_coefficient <- function(x) { +morie_contingency_coefficient <- function(x) { X <- as.matrix(x) if (length(dim(X)) != 2L || length(X) == 0L) { - return(list(statistic = NA_real_, cramers_v = NA_real_, - chi2 = NA_real_, p_value = NA_real_, df = NA_integer_, - max_C = NA_real_, n = 0L, - method = "Pearson contingency coefficient")) + return(list( + statistic = NA_real_, morie_cramers_v = NA_real_, + chi2 = NA_real_, p_value = NA_real_, df = NA_integer_, + max_C = NA_real_, n = 0L, + method = "Pearson contingency coefficient" + )) } ct <- suppressWarnings(stats::chisq.test(X, correct = FALSE)) n_total <- sum(X) chi2 <- as.numeric(ct$statistic) C <- sqrt(chi2 / (chi2 + n_total)) - r <- nrow(X); c <- ncol(X); mn <- min(r, c) + r <- nrow(X) + c <- ncol(X) + mn <- min(r, c) V <- if (mn > 1) sqrt(chi2 / (n_total * (mn - 1))) else NA_real_ max_C <- if (mn > 1) sqrt((mn - 1) / mn) else NA_real_ list( statistic = C, - cramers_v = V, + morie_cramers_v = V, chi2 = chi2, p_value = as.numeric(ct$p.value), df = as.integer(ct$parameter), diff --git a/r-package/morie/R/cntrl.R b/r-package/morie/R/cntrl.R index 89acb01480..7314817c17 100644 --- a/r-package/morie/R/cntrl.R +++ b/r-package/morie/R/cntrl.R @@ -14,20 +14,24 @@ #' @return list: estimate, se, c_coef, var_ratio_cv_over_crude, n, method. #' @keywords internal cntrl_estimator <- function(y, c_var, mu_c) { - y <- as.numeric(y); cc <- as.numeric(c_var) + y <- as.numeric(y) + cc <- as.numeric(c_var) n <- length(y) - if (n < 2L || length(cc) != n) + if (n < 2L || length(cc) != n) { return(list(estimate = NA_real_, n = n, method = "control-variates (bad input)")) + } c_coef <- stats::cov(y, cc) / stats::var(cc) theta_cv <- mean(y) - c_coef * (mean(cc) - mu_c) se_cv <- sqrt(stats::var(y - c_coef * (cc - mu_c)) / n) rho <- stats::cor(y, cc) - list(estimate = as.numeric(theta_cv), - se = as.numeric(se_cv), - c_coef = as.numeric(c_coef), - var_ratio_cv_over_crude = as.numeric(1 - rho^2), - n = as.integer(n), - method = "Control variates (Nelson 1990)") + list( + estimate = as.numeric(theta_cv), + se = as.numeric(se_cv), + c_coef = as.numeric(c_coef), + var_ratio_cv_over_crude = as.numeric(1 - rho^2), + n = as.integer(n), + method = "Control variates (Nelson 1990)" + ) } # CANONICAL TEST @@ -39,4 +43,4 @@ cntrl_estimator <- function(y, c_var, mu_c) { #' @rdname cntrl_estimator #' @keywords internal #' @export -control_variates <- cntrl_estimator +morie_control_variates <- cntrl_estimator diff --git a/r-package/morie/R/cohrc.R b/r-package/morie/R/cohrc.R index 69b4f0e0f7..f910cab56b 100644 --- a/r-package/morie/R/cohrc.R +++ b/r-package/morie/R/cohrc.R @@ -1,45 +1,52 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -#' Magnitude-squared coherence between two time series +#' Magnitude-squared morie_coherence between two time series #' #' @param x Numeric vector. #' @param y Numeric vector (same length). #' @param nperseg Segment length. Default n/4. #' @param fs Sampling frequency. Default 1. -#' @return Named list with \code{frequencies, coherence, n_segments, +#' @return Named list with \code{frequencies, morie_coherence, n_segments, #' nperseg, fs, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_coherence(x = rnorm(50), y = rnorm(50)) #' @export -coherence <- function(x, y, nperseg = NULL, fs = 1) { - x <- as.numeric(x); y <- as.numeric(y) +morie_coherence <- function(x, y, nperseg = NULL, fs = 1) { + x <- as.numeric(x) + y <- as.numeric(y) if (length(x) != length(y)) stop("Length mismatch.") - n <- length(x); if (n < 8) stop("Need >=8 obs.") + n <- length(x) + if (n < 8) stop("Need >=8 obs.") if (is.null(nperseg)) nperseg <- max(n %/% 4, 4) nperseg <- min(nperseg, n) step <- nperseg %/% 2 nfreq <- nperseg %/% 2 + 1 - Sxx <- numeric(nfreq); Syy <- numeric(nfreq) - Sxy <- complex(nfreq); nseg <- 0; start <- 1 + Sxx <- numeric(nfreq) + Syy <- numeric(nfreq) + Sxy <- complex(nfreq) + nseg <- 0 + start <- 1 while (start + nperseg - 1 <= n) { xs <- x[start:(start + nperseg - 1)] - mean(x[start:(start + nperseg - 1)]) ys <- y[start:(start + nperseg - 1)] - mean(y[start:(start + nperseg - 1)]) - fx <- fft(xs)[1:nfreq]; fy <- fft(ys)[1:nfreq] + fx <- fft(xs)[1:nfreq] + fy <- fft(ys)[1:nfreq] Sxx <- Sxx + Mod(fx)^2 Syy <- Syy + Mod(fy)^2 Sxy <- Sxy + fx * Conj(fy) nseg <- nseg + 1 start <- start + step } - Sxx <- Sxx / nseg; Syy <- Syy / nseg; Sxy <- Sxy / nseg + Sxx <- Sxx / nseg + Syy <- Syy / nseg + Sxy <- Sxy / nseg denom <- pmax(Sxx * Syy, 1e-15) coh <- Mod(Sxy)^2 / denom freqs <- seq(0, fs / 2, length.out = nfreq) - list(frequencies = freqs, coherence = coh, - n_segments = nseg, nperseg = nperseg, - fs = fs, n = n, - method = "Magnitude-squared coherence (Welch, base R)") + list( + frequencies = freqs, morie_coherence = coh, + n_segments = nseg, nperseg = nperseg, + fs = fs, n = n, + method = "Magnitude-squared morie_coherence (Welch, base R)" + ) } diff --git a/r-package/morie/R/coitg.R b/r-package/morie/R/coitg.R index 416ceee30e..0f2ffd0878 100644 --- a/r-package/morie/R/coitg.R +++ b/r-package/morie/R/coitg.R @@ -7,48 +7,60 @@ #' @param max_lag Max ADF augmentation lags. Default \code{floor(12*(n/100)^0.25)}. #' @return Named list with \code{adf_statistic, p_value, beta, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_eg_coint(y1 = rnorm(100), y2 = rnorm(100)) #' @export -eg_coint <- function(y1, y2, max_lag = NULL) { - y1 <- as.numeric(y1); y2 <- as.numeric(y2) +morie_eg_coint <- function(y1, y2, max_lag = NULL) { + y1 <- as.numeric(y1) + y2 <- as.numeric(y2) if (length(y1) != length(y2)) stop("Length mismatch.") - n <- length(y1); if (n < 20) stop("Need >=20 obs.") + n <- length(y1) + if (n < 20) stop("Need >=20 obs.") if (is.null(max_lag)) max_lag <- floor(12 * (n / 100)^0.25) fit_ls <- lm(y1 ~ y2) beta <- coef(fit_ls) resid <- residuals(fit_ls) if (requireNamespace("urca", quietly = TRUE)) { - adf <- urca::ur.df(resid, type = "none", lags = max_lag, - selectlags = "AIC") + adf <- urca::ur.df(resid, + type = "none", lags = max_lag, + selectlags = "AIC" + ) stat <- as.numeric(adf@teststat[1]) } else { # Plain ADF-style t-stat on residuals. - dr <- diff(resid); T <- length(dr) - max_lag + dr <- diff(resid) + n_eff <- length(dr) - max_lag dep <- dr[(max_lag + 1):length(dr)] - Xr <- resid[(max_lag + 1):length(resid) - 1] + # Level regressor resid[t], aligned to dep; indexing to length(dr) + # (not length(resid)) keeps Xr the same length as dep. + Xr <- resid[(max_lag + 1):length(dr)] Xr <- cbind(Xr) if (max_lag >= 1) { - for (i in seq_len(max_lag)) + for (i in seq_len(max_lag)) { Xr <- cbind(Xr, dr[(max_lag + 1 - i):(length(dr) - i)]) + } } b <- lsfit(Xr, dep, intercept = FALSE) e <- dep - Xr %*% b$coef - sig2 <- sum(e^2) / (T - ncol(Xr)) + sig2 <- sum(e^2) / (n_eff - ncol(Xr)) se <- sqrt(sig2 * solve(crossprod(Xr))[1, 1]) stat <- b$coef[1] / se } crit <- c(`1%` = -3.90, `5%` = -3.34, `10%` = -3.04) - approx_p <- if (stat < crit["1%"]) 0.005 else - if (stat < crit["5%"]) 0.03 else - if (stat < crit["10%"]) 0.07 else - min(1, 2 * pnorm(stat)) - list(adf_statistic = as.numeric(stat), - p_value = as.numeric(approx_p), - beta = unname(beta), - critical_values = crit, - n = n, - method = "Engle-Granger 2-step cointegration (Engle & Granger 1987)") + approx_p <- if (stat < crit["1%"]) { + 0.005 + } else if (stat < crit["5%"]) { + 0.03 + } else if (stat < crit["10%"]) { + 0.07 + } else { + min(1, 2 * pnorm(stat)) + } + list( + adf_statistic = as.numeric(stat), + p_value = as.numeric(approx_p), + beta = unname(beta), + critical_values = crit, + n = n, + method = "Engle-Granger 2-step cointegration (Engle & Granger 1987)" + ) } diff --git a/r-package/morie/R/cokrg.R b/r-package/morie/R/cokrg.R index efaf85ecea..2573f41f81 100644 --- a/r-package/morie/R/cokrg.R +++ b/r-package/morie/R/cokrg.R @@ -1,5 +1,5 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -#' Simple cokriging for co-located bivariate spatial prediction. +#' Simple morie_cokriging for co-located bivariate spatial prediction. #' #' \deqn{\hat Z_1(s_0) = \lambda^\top Z_1 + \mu^\top Z_2}, system #' \deqn{[C_{pp} \; C_{ps}; C_{ps}^\top \; C_{ss}] [\lambda; \mu] = [c_{0p}; c_{0s}]}. @@ -15,23 +15,31 @@ #' @return Named list: estimate, se, n, method. #' @references Schabenberger & Gotway (2005), Ch 4. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' cokrg(x = rnorm(50), y = rnorm(50), coords = matrix(runif(100), 50, 2), target = rnorm(50)) #' @export cokrg <- function(x, y, coords, target, sill_p = 1, range_p = 1, sill_s = 1, range_s = 1, cross_sill = 0.5, cross_range = 1, nugget = 0) { - x <- as.numeric(x); y <- as.numeric(y); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + y <- as.numeric(y) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) - target <- if (is.matrix(target)) target else - matrix(as.numeric(unlist(target)), ncol = ncol(coords)) - if (length(y) != n || nrow(coords) != n) + } + if (!is.matrix(target)) { + tv <- as.numeric(unlist(target)) + if (length(tv) %% ncol(coords) != 0L) { + stop("target/coords dim mismatch") + } + target <- matrix(tv, ncol = ncol(coords), byrow = TRUE) + } + if (length(y) != n || nrow(coords) != n) { stop("x, y, and coords must have matching n") + } if (ncol(target) != ncol(coords)) stop("target/coords dim mismatch") D <- as.matrix(stats::dist(coords)) cov_exp <- function(D_, c0, c1, a) c1 * exp(-D_ / a) + ifelse(D_ == 0, c0, 0) @@ -42,23 +50,27 @@ cokrg <- function(x, y, coords, target, z <- c(x, y) var0 <- sill_p m <- nrow(target) - ests <- numeric(m); ses <- numeric(m) + ests <- numeric(m) + ses <- numeric(m) for (k in seq_len(m)) { - d0 <- sqrt(colSums((t(coords) - target[k, ]) ^ 2)) + d0 <- sqrt(colSums((t(coords) - target[k, ])^2)) c0p <- cov_exp(d0, nugget, sill_p - nugget, range_p) c0s <- cross_sill * exp(-d0 / cross_range) c_vec <- c(c0p, c0s) w <- tryCatch(solve(C, c_vec), - error = function(e) qr.solve(C, c_vec)) + error = function(e) qr.solve(C, c_vec) + ) ests[k] <- sum(w * z) ses[k] <- sqrt(max(var0 - sum(w * c_vec), 0)) } - list(estimate = if (m == 1) ests[1] else ests, - se = if (m == 1) ses[1] else ses, n = n, - method = "Simple cokriging (linear coregionalization, exp. cov)") + list( + estimate = if (m == 1) ests[1] else ests, + se = if (m == 1) ses[1] else ses, n = n, + method = "Simple morie_cokriging (linear coregionalization, exp. cov)" + ) } #' @rdname cokrg #' @keywords internal #' @export -cokriging <- cokrg +morie_cokriging <- cokrg diff --git a/r-package/morie/R/confm.R b/r-package/morie/R/confm.R index 67aa42cacd..b649042a96 100644 --- a/r-package/morie/R/confm.R +++ b/r-package/morie/R/confm.R @@ -12,26 +12,25 @@ #' labels, precision, recall, f1, macro_precision, macro_recall, #' macro_f1, weighted_f1, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_confusion_matrix_metrics(y_true = rbinom(50, 1, 0.5), y_pred = rbinom(50, 1, 0.5)) #' @export -confusion_matrix_metrics <- function(y_true, y_pred, labels = NULL) { - yt <- as.character(y_true); yp <- as.character(y_pred) +morie_confusion_matrix_metrics <- function(y_true, y_pred, labels = NULL) { + yt <- as.character(y_true) + yp <- as.character(y_pred) if (is.null(labels)) labels <- sort(unique(c(yt, yp))) labels <- as.character(labels) K <- length(labels) cm <- matrix(0L, nrow = K, ncol = K, dimnames = list(labels, labels)) for (i in seq_along(yt)) { - a <- match(yt[i], labels); b <- match(yp[i], labels) + a <- match(yt[i], labels) + b <- match(yp[i], labels) cm[a, b] <- cm[a, b] + 1L } diag_ <- diag(cm) col_sums <- colSums(cm) row_sums <- rowSums(cm) precision <- ifelse(col_sums > 0, diag_ / col_sums, 0) - recall <- ifelse(row_sums > 0, diag_ / row_sums, 0) + recall <- ifelse(row_sums > 0, diag_ / row_sums, 0) f1 <- ifelse(precision + recall > 0, 2 * precision * recall / (precision + recall), 0) acc <- sum(diag_) / sum(cm) support <- row_sums diff --git a/r-package/morie/R/copul.R b/r-package/morie/R/copul.R index 2b362f420c..df49a860a2 100644 --- a/r-package/morie/R/copul.R +++ b/r-package/morie/R/copul.R @@ -8,27 +8,35 @@ #' #' @param x,y numeric marginal samples. #' @param family "gaussian", "clayton", or "gumbel". -#' @return list: estimate, kendall_tau, se_tau, u, v, family, n, method. -#' @importFrom stats cor.test rank +#' @return list: estimate, morie_kendall_tau, se_tau, u, v, family, n, method. +#' @importFrom stats cor.test #' @keywords internal copul <- function(x, y, family = c("gaussian", "clayton", "gumbel")) { family <- match.arg(family) - x <- as.numeric(x); y <- as.numeric(y); n <- min(length(x), length(y)) - if (n < 3L) - return(list(estimate = NA_real_, n = n, - method = paste0("copula-", family, " (n<3)"))) + x <- as.numeric(x) + y <- as.numeric(y) + n <- min(length(x), length(y)) + if (n < 3L) { + return(list( + estimate = NA_real_, n = n, + method = paste0("copula-", family, " (n<3)") + )) + } tau <- stats::cor(x[seq_len(n)], y[seq_len(n)], method = "kendall") theta <- switch(family, - gaussian = sin(pi * tau / 2), - clayton = if (tau < 1) 2 * tau / (1 - tau) else Inf, - gumbel = if (tau < 1) 1 / (1 - tau) else Inf) + gaussian = sin(pi * tau / 2), + clayton = if (tau < 1) 2 * tau / (1 - tau) else Inf, + gumbel = if (tau < 1) 1 / (1 - tau) else Inf + ) u <- (rank(x[seq_len(n)]) - 0.5) / n v <- (rank(y[seq_len(n)]) - 0.5) / n - list(estimate = as.numeric(theta), - kendall_tau = as.numeric(tau), - se_tau = sqrt((1 - tau^2) / n), - u = u, v = v, family = family, n = as.integer(n), - method = paste0("Copula ", family, " (rank-based; Nelsen 2006)")) + list( + estimate = as.numeric(theta), + morie_kendall_tau = as.numeric(tau), + se_tau = sqrt((1 - tau^2) / n), + u = u, v = v, family = family, n = as.integer(n), + method = paste0("Copula ", family, " (rank-based; Nelsen 2006)") + ) } # CANONICAL TEST @@ -40,4 +48,4 @@ copul <- function(x, y, family = c("gaussian", "clayton", "gumbel")) { #' @rdname copul #' @keywords internal #' @export -copula_estimation <- copul +morie_copula_estimation <- copul diff --git a/r-package/morie/R/cov2s.R b/r-package/morie/R/cov2s.R index 1c40463305..b41c22d5e8 100644 --- a/r-package/morie/R/cov2s.R +++ b/r-package/morie/R/cov2s.R @@ -11,19 +11,20 @@ #' @return Named list: block_freq, block_prop, expected_prop, m, n, #' cumulative, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_two_sample_coverage(x = rnorm(50), y = rnorm(50)) #' @export -two_sample_coverage <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) - m <- length(x); n <- length(y) +morie_two_sample_coverage <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) + m <- length(x) + n <- length(y) if (m < 1 || n < 1) { - return(list(block_freq = integer(0), block_prop = numeric(0), - expected_prop = NA_real_, m = m, n = n, - cumulative = 0L, - method = "Two-sample coverage probability")) + return(list( + block_freq = integer(0), block_prop = numeric(0), + expected_prop = NA_real_, m = m, n = n, + cumulative = 0L, + method = "Two-sample coverage probability" + )) } xs <- sort(x) # findInterval(y, xs) gives 0..m where 0 means y <= xs[1]-eps and m means y > xs[m] diff --git a/r-package/morie/R/covsp.R b/r-package/morie/R/covsp.R index 7c94858179..72f867fe8f 100644 --- a/r-package/morie/R/covsp.R +++ b/r-package/morie/R/covsp.R @@ -10,18 +10,17 @@ #' @return Named list: coverages, cumulative, expected, n, #' sample_min, sample_max, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_one_sample_coverage(x = rnorm(50)) #' @export -one_sample_coverage <- function(x) { +morie_one_sample_coverage <- function(x) { x <- as.numeric(x) n <- length(x) if (n < 2) { - return(list(coverages = numeric(0), cumulative = NA_real_, - expected = NA_real_, n = n, - method = "One-sample coverage probability")) + return(list( + coverages = numeric(0), cumulative = NA_real_, + expected = NA_real_, n = n, + method = "One-sample coverage probability" + )) } xs <- sort(x) ranks <- seq_len(n) / (n + 1) diff --git a/r-package/morie/R/cslat.R b/r-package/morie/R/cslat.R index da6e587488..6e9ee0e6b2 100644 --- a/r-package/morie/R/cslat.R +++ b/r-package/morie/R/cslat.R @@ -7,9 +7,13 @@ #' @return Named list with tensor, n, method. #' @keywords internal causal_attention_mask <- function(x) { - n <- if (length(x) == 1L && is.numeric(x)) as.integer(x) - else if (!is.null(dim(x))) dim(x)[length(dim(x)) - 1L] - else length(x) + n <- if (length(x) == 1L && is.numeric(x)) { + as.integer(x) + } else if (!is.null(dim(x))) { + dim(x)[length(dim(x)) - 1L] + } else { + length(x) + } M <- matrix(0, n, n) M[upper.tri(M)] <- -Inf list(tensor = M, n = n, method = "causal-mask") diff --git a/r-package/morie/R/cslnc.R b/r-package/morie/R/cslnc.R index 2200764455..225c28bd02 100644 --- a/r-package/morie/R/cslnc.R +++ b/r-package/morie/R/cslnc.R @@ -11,19 +11,22 @@ #' total_steps, warmup_steps, method. #' @keywords internal cosine_lr_schedule <- function(x, lr_max = 1e-3, lr_min = 0, - total_steps = 1000L, warmup_steps = 0L) { - if (total_steps <= warmup_steps) + total_steps = 1000L, warmup_steps = 0L) { + if (total_steps <= warmup_steps) { stop("total_steps must exceed warmup_steps") + } t <- as.numeric(x) warm <- t < warmup_steps lr <- numeric(length(t)) lr[warm] <- lr_max * t[warm] / max(1, warmup_steps) dec <- pmin(pmax((t - warmup_steps) / - (total_steps - warmup_steps), 0), 1) + (total_steps - warmup_steps), 0), 1) lr[!warm] <- lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * dec[!warm])) - list(value = lr[1L], tensor = lr, step = t, - lr_max = lr_max, lr_min = lr_min, - total_steps = total_steps, warmup_steps = warmup_steps, - method = "cosine-LR") + list( + value = lr[1L], tensor = lr, step = t, + lr_max = lr_max, lr_min = lr_min, + total_steps = total_steps, warmup_steps = warmup_steps, + method = "cosine-LR" + ) } diff --git a/r-package/morie/R/csphr.R b/r-package/morie/R/csphr.R index 4241fd894e..aad5cf5061 100644 --- a/r-package/morie/R/csphr.R +++ b/r-package/morie/R/csphr.R @@ -11,27 +11,32 @@ #' @return Named list with `w`, `c`, `midpoint`, `correct_class`, `n`, #' `p`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' csphr(x = rnorm(50)) #' @export csphr <- function(x, votes = NULL) { X <- if (is.matrix(x)) x else matrix(as.numeric(x), ncol = 1L) - n <- nrow(X); p <- ncol(X) - if (is.null(votes) || n == 0L) - return(list(w = rep(0, p), c = NA_real_, midpoint = rep(NA_real_, p), - correct_class = 0L, n = n, p = p, - method = "cutting_plane_sphere")) + n <- nrow(X) + p <- ncol(X) + if (is.null(votes) || n == 0L) { + return(list( + w = rep(0, p), c = NA_real_, midpoint = rep(NA_real_, p), + correct_class = 0L, n = n, p = p, + method = "morie_cutting_plane_sphere" + )) + } y <- as.integer(votes) - Xy <- X[y == 1, , drop = FALSE]; Xn <- X[y == 0, , drop = FALSE] + Xy <- X[y == 1, , drop = FALSE] + Xn <- X[y == 0, , drop = FALSE] if (nrow(Xy) == 0L || nrow(Xn) == 0L) { cc <- max(sum(y == 1L), sum(y == 0L)) - return(list(w = rep(0, p), c = NA_real_, midpoint = rep(NA_real_, p), - correct_class = as.integer(cc), n = n, p = p, - method = "cutting_plane_sphere")) + return(list( + w = rep(0, p), c = NA_real_, midpoint = rep(NA_real_, p), + correct_class = as.integer(cc), n = n, p = p, + method = "morie_cutting_plane_sphere" + )) } - mu_y <- colMeans(Xy); mu_n <- colMeans(Xn) + mu_y <- colMeans(Xy) + mu_n <- colMeans(Xn) S_y <- if (nrow(Xy) > 1L) stats::var(Xy) else matrix(0, p, p) S_n <- if (nrow(Xn) > 1L) stats::var(Xn) else matrix(0, p, p) S <- ((nrow(Xy) - 1) * S_y + (nrow(Xn) - 1) * S_n) / max(n - 2L, 1L) @@ -41,12 +46,18 @@ csphr <- function(x, votes = NULL) { c_int <- as.numeric(w %*% midpoint) pred <- as.integer(X %*% w > c_int) cc <- sum(pred == y) - if (cc < n - cc) { w <- -w; c_int <- -c_int; cc <- n - cc } - list(w = w, c = c_int, midpoint = midpoint, correct_class = as.integer(cc), - n = n, p = p, method = "cutting_plane_sphere") + if (cc < n - cc) { + w <- -w + c_int <- -c_int + cc <- n - cc + } + list( + w = w, c = c_int, midpoint = midpoint, correct_class = as.integer(cc), + n = n, p = p, method = "morie_cutting_plane_sphere" + ) } #' @keywords internal #' @rdname csphr #' @export -cutting_plane_sphere <- csphr +morie_cutting_plane_sphere <- csphr diff --git a/r-package/morie/R/ctmed.R b/r-package/morie/R/ctmed.R index d5c6e11f3b..39c1bdf5f9 100644 --- a/r-package/morie/R/ctmed.R +++ b/r-package/morie/R/ctmed.R @@ -10,25 +10,30 @@ #' @return Named list: statistic, p_value, df, n, grand_median, table. #' @importFrom stats median chisq.test #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_control_median_test(x = rnorm(50), y = rnorm(50)) #' @export -control_median_test <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) - m <- length(x); n <- length(y) +morie_control_median_test <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) + m <- length(x) + n <- length(y) if (m < 2 || n < 2) { - return(list(statistic = NA_real_, p_value = NA_real_, df = 1L, - n = m + n, grand_median = NA_real_, - method = "Control-median (Mood's median) test")) + return(list( + statistic = NA_real_, p_value = NA_real_, df = 1L, + n = m + n, grand_median = NA_real_, + method = "Control-median (Mood's median) test" + )) } med <- stats::median(c(x, y)) # Ties: count == as below (matches scipy 'below') - tbl <- matrix(c(sum(x > med), sum(x <= med), - sum(y > med), sum(y <= med)), - nrow = 2, byrow = TRUE, - dimnames = list(c("x", "y"), c("above", "below_eq"))) + tbl <- matrix( + c( + sum(x > med), sum(x <= med), + sum(y > med), sum(y <= med) + ), + nrow = 2, byrow = TRUE, + dimnames = list(c("x", "y"), c("above", "below_eq")) + ) ct <- suppressWarnings(stats::chisq.test(tbl, correct = TRUE)) list( statistic = as.numeric(ct$statistic), diff --git a/r-package/morie/R/ctrlc.R b/r-package/morie/R/ctrlc.R index 37c64e3282..652b3f2a34 100644 --- a/r-package/morie/R/ctrlc.R +++ b/r-package/morie/R/ctrlc.R @@ -13,42 +13,47 @@ #' @return Named list: statistic, p_value, p_adjusted, n, k, control_n. #' @importFrom stats wilcox.test p.adjust #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_control_comparison(groups = list(rnorm(20), rnorm(20), rnorm(20))) #' @export -control_comparison <- function(groups, control_index = 1L, - adjust = c("bonferroni", "none")) { +morie_control_comparison <- function(groups, control_index = 1L, + adjust = c("bonferroni", "none")) { adjust <- match.arg(adjust) - if (!is.list(groups) || length(groups) < 2L) - return(list(statistic = numeric(0), p_value = numeric(0), - p_adjusted = numeric(0), n = integer(0), - k = 0L, control_n = 0L, - method = "Many-to-one control comparison")) + if (!is.list(groups) || length(groups) < 2L) { + return(list( + statistic = numeric(0), p_value = numeric(0), + p_adjusted = numeric(0), n = integer(0), + k = 0L, control_n = 0L, + method = "Many-to-one control comparison" + )) + } arrs <- lapply(groups, as.numeric) ctrl <- arrs[[control_index]] trts <- arrs[-control_index] k <- length(trts) - Us <- numeric(k); ps <- numeric(k) + Us <- numeric(k) + ps <- numeric(k) for (i in seq_along(trts)) { if (min(length(ctrl), length(trts[[i]])) < 2) { - Us[i] <- NA_real_; ps[i] <- NA_real_; next + Us[i] <- NA_real_ + ps[i] <- NA_real_ + next } r <- suppressWarnings(stats::wilcox.test(ctrl, trts[[i]], - exact = FALSE, - correct = FALSE)) + exact = FALSE, + correct = FALSE + )) Us[i] <- as.numeric(r$statistic) ps[i] <- as.numeric(r$p.value) } p_adj <- switch(adjust, - bonferroni = pmin(ps * k, 1), - none = ps) + bonferroni = pmin(ps * k, 1), + none = ps + ) list( statistic = Us, p_value = ps, p_adjusted = p_adj, - n = sapply(trts, length), + n = vapply(trts, length, integer(1)), k = k, control_n = length(ctrl), adjust = adjust, diff --git a/r-package/morie/R/data_access.R b/r-package/morie/R/data_access.R new file mode 100644 index 0000000000..d484cc54e2 --- /dev/null +++ b/r-package/morie/R/data_access.R @@ -0,0 +1,435 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# +# data_access.R -- generic open-data access layer for MORIE. +# +# Three public entry points let users reach data sources beyond the +# built-in catalog: +# * morie_fetch() -- universal URL fetcher, auto-detects format +# * morie_ckan_search() -- discover datasets on any CKAN portal +# * morie_fetch_arcgis() -- query an ArcGIS FeatureServer/MapServer layer +# +# Everything is automatic by default (format is detected, pagination is +# handled) but every step can be overridden by the caller. + +# --- internal helpers ------------------------------------------------------ + +# Append a named list of query parameters to a URL, URL-encoding values. +.morie_url_with_params <- function(url, params = NULL) { + if (is.null(params) || length(params) == 0L) { + return(url) + } + params <- params[!vapply(params, is.null, logical(1))] + if (length(params) == 0L) { + return(url) + } + kv <- vapply(seq_along(params), function(i) { + paste0( + utils::URLencode(names(params)[i], reserved = TRUE), "=", + utils::URLencode(as.character(params[[i]]), reserved = TRUE) + ) + }, character(1)) + sep <- if (grepl("?", url, fixed = TRUE)) "&" else "?" + paste0(url, sep, paste(kv, collapse = "&")) +} + +# Known CKAN portals. A caller may also pass a full base URL directly. +.MORIE_CKAN_PORTALS <- c( + "open.canada.ca" = "https://open.canada.ca/data/en", + "data.ontario.ca" = "https://data.ontario.ca", + "open.toronto.ca" = + "https://ckan0.cf.opendata.inter.prod-toronto.ca" +) + +.morie_ckan_portal <- function(portal) { + if (grepl("^https?://", portal)) { + return(sub("/+$", "", portal)) + } + if (portal %in% names(.MORIE_CKAN_PORTALS)) { + return(.MORIE_CKAN_PORTALS[[portal]]) + } + stop("Unknown CKAN portal '", portal, "'. Known portals: ", + paste(names(.MORIE_CKAN_PORTALS), collapse = ", "), + " -- or pass a full https:// base URL.", + call. = FALSE + ) +} + +# Read text from a URL (used for JSON/XML/HTML API responses). +.morie_read_text <- function(url) { + con <- url(url) + on.exit(close(con), add = TRUE) + paste(readLines(con, warn = FALSE), collapse = "\n") +} + +# Download a URL to a temp file, returning the local path. +.morie_download <- function(url, ext = "") { + if (!nzchar(ext)) ext <- tools::file_ext(sub("\\?.*$", "", url)) + tmp <- tempfile(fileext = if (nzchar(ext)) paste0(".", ext) else "") + utils::download.file(url, tmp, mode = "wb", quiet = TRUE) + tmp +} + +# Detect the format of a URL from its HTTP Content-Type header, falling +# back to the URL file extension. Returns one of the morie_fetch formats. +.morie_detect_format <- function(url) { + ct <- tryCatch( + { + h <- suppressWarnings(curlGetHeaders(url)) + line <- grep("^content-type:", tolower(h), value = TRUE) + if (length(line)) sub("^content-type:\\s*", "", line[length(line)]) else "" + }, + error = function(e) "" + ) + ct <- tolower(ct) + if (grepl("zip", ct)) { + return("zip") + } + if (grepl("json", ct)) { + return("json") + } + if (grepl("csv", ct)) { + return("csv") + } + if (grepl("tab-separated", ct)) { + return("tsv") + } + if (grepl("spreadsheet|ms-excel|officedocument", ct)) { + return("xlsx") + } + if (grepl("xml", ct)) { + return("xml") + } + if (grepl("html", ct)) { + return("html") + } + ext <- tolower(tools::file_ext(sub("\\?.*$", "", url))) + switch(ext, + zip = "zip", + json = "json", + csv = "csv", + tsv = "tsv", + txt = "csv", + xlsx = "xlsx", + xls = "xlsx", + xml = "xml", + html = "html", + htm = "html", + "csv" + ) # last-resort default +} + +# Parse a downloaded local file according to a known format. +.morie_parse_file <- function(path, format, simplify, ...) { + if (format %in% c("xlsx")) { + if (!requireNamespace("readxl", quietly = TRUE)) { + stop("Package 'readxl' is required to read xlsx data.", call. = FALSE) + } + return(as.data.frame(readxl::read_excel(path, ...))) + } + if (format == "tsv") { + return(utils::read.delim(path, + stringsAsFactors = FALSE, + check.names = FALSE, ... + )) + } + if (format == "csv") { + return(utils::read.csv(path, + stringsAsFactors = FALSE, + check.names = FALSE, ... + )) + } + if (format == "json") { + if (!requireNamespace("jsonlite", quietly = TRUE)) { + stop("Package 'jsonlite' is required to read JSON data.", call. = FALSE) + } + return(jsonlite::fromJSON(path, simplifyVector = simplify)) + } + if (format == "xml") { + if (!requireNamespace("xml2", quietly = TRUE)) { + stop("Package 'xml2' is required to read XML data.", call. = FALSE) + } + doc <- xml2::read_xml(path) + return(if (simplify) xml2::as_list(doc) else doc) + } + if (format == "html") { + if (!requireNamespace("xml2", quietly = TRUE)) { + stop("Package 'xml2' is required to read HTML data.", call. = FALSE) + } + doc <- xml2::read_html(path) + if (simplify && requireNamespace("rvest", quietly = TRUE)) { + tbls <- rvest::html_table(doc) + if (length(tbls) == 1L) { + return(as.data.frame(tbls[[1L]])) + } + if (length(tbls) > 1L) { + return(lapply(tbls, as.data.frame)) + } + } + return(doc) + } + stop("Unsupported parse format: ", format, call. = FALSE) +} + +# --- morie_fetch ----------------------------------------------------------- + +#' Fetch a dataset from any URL, with automatic format detection +#' +#' A universal data-access entry point. Given a URL, MORIE detects the +#' format from the HTTP \code{Content-Type} header (falling back to the +#' URL extension), downloads the resource, and parses it into an R +#' object. The behaviour is automatic by default but every step is +#' controllable: pass an explicit \code{format}, extra query +#' \code{params}, a \code{zip_member} to extract, or reader arguments +#' via \code{...}. +#' +#' Supported formats: \code{csv}, \code{tsv}, \code{json}, \code{xml}, +#' \code{html}, \code{xlsx}, \code{zip} (extract one member), and +#' \code{arcgis} (delegates to \code{\link{morie_fetch_arcgis}}). +#' +#' @param url The resource URL. +#' @param format One of \code{"auto"} (default), \code{"csv"}, +#' \code{"tsv"}, \code{"json"}, \code{"xml"}, \code{"html"}, +#' \code{"xlsx"}, \code{"zip"}, \code{"arcgis"}. +#' @param params Optional named list appended to \code{url} as a +#' URL-encoded query string. +#' @param zip_member For \code{zip} downloads, the archive member to +#' extract (matched by basename, then by substring). +#' @param simplify For \code{json}/\code{xml}/\code{html}, whether to +#' simplify into a data.frame where possible (default \code{TRUE}). +#' @param ... Passed to the underlying reader (e.g. \code{\link{read.csv}} +#' arguments, or \code{\link{morie_fetch_arcgis}} arguments). +#' @return A data.frame for tabular formats; a list or document object +#' for non-tabular \code{json}/\code{xml}/\code{html}. +#' @examples +#' \dontrun{ +#' # Examples use placeholder URLs (example.org). Replace with a +#' # real CSV / JSON endpoint when running. +#' df <- morie_fetch("https://example.org/data.csv") +#' js <- morie_fetch("https://api.example.org/records", +#' format = "json", params = list(limit = 100) +#' ) +#' } +#' @seealso \code{\link{morie_ckan_search}}, \code{\link{morie_fetch_arcgis}} +#' @export +morie_fetch <- function(url, + format = c( + "auto", "csv", "tsv", "json", "xml", + "html", "xlsx", "zip", "arcgis" + ), + params = NULL, zip_member = "", simplify = TRUE, + ...) { + format <- match.arg(format) + full_url <- .morie_url_with_params(url, params) + + if (format == "arcgis") { + return(morie_fetch_arcgis(url, params = params, ...)) + } + + if (format == "auto") format <- .morie_detect_format(full_url) + + if (format == "zip") { + if (!nzchar(zip_member)) { + stop("A 'zip_member' is required to extract from a .zip resource.", + call. = FALSE + ) + } + zpath <- .morie_download(full_url, ext = "zip") + on.exit(unlink(zpath), add = TRUE) + exdir <- tempfile("morie-unzip-") + dir.create(exdir) + on.exit(unlink(exdir, recursive = TRUE), add = TRUE) + members <- utils::unzip(zpath, list = TRUE)$Name + hit <- members[basename(members) == zip_member] + if (length(hit) == 0L) { + hit <- members[grepl(zip_member, members, fixed = TRUE)] + } + if (length(hit) == 0L) { + stop("zip member '", zip_member, "' not found in ", url, call. = FALSE) + } + utils::unzip(zpath, files = hit[1L], exdir = exdir, junkpaths = TRUE) + inner <- file.path(exdir, basename(hit[1L])) + inner_fmt <- .morie_detect_format(inner) + return(.morie_parse_file(inner, inner_fmt, simplify, ...)) + } + + if (format %in% c("csv", "tsv", "xlsx")) { + path <- .morie_download(full_url, ext = format) + on.exit(unlink(path), add = TRUE) + return(.morie_parse_file(path, format, simplify, ...)) + } + + # json / xml / html: read the response text into a temp file. + txt <- .morie_read_text(full_url) + path <- tempfile(fileext = paste0(".", format)) + on.exit(unlink(path), add = TRUE) + writeLines(txt, path, useBytes = TRUE) + .morie_parse_file(path, format, simplify, ...) +} + +# --- morie_ckan_search ----------------------------------------------------- + +#' Search any CKAN open-data portal for datasets +#' +#' Wraps the CKAN \code{package_search} action so users can discover +#' datasets that are not in the built-in MORIE catalog and fetch them +#' through \code{\link{morie_fetch_ckan}} or \code{\link{morie_fetch}}. +#' +#' @param query Free-text search string. +#' @param portal A known portal name (\code{"open.canada.ca"}, +#' \code{"data.ontario.ca"}, \code{"open.toronto.ca"}) or a full +#' CKAN base URL (e.g. \code{"https://catalogue.example.org"}). +#' @param rows Maximum number of datasets to return (default 25). +#' @param ... Extra named CKAN \code{package_search} parameters +#' (e.g. \code{fq = "res_format:CSV"}, \code{sort = "metadata_modified desc"}). +#' @return A data.frame with one row per resource, columns: +#' \code{dataset_title}, \code{dataset_id}, \code{resource_id}, +#' \code{resource_name}, \code{format}, \code{datastore_active}, +#' \code{url}. Feed \code{resource_id} into +#' \code{morie_fetch_ckan(resource_id = ...)}. +#' @examples +#' \dontrun{ +#' hits <- morie_ckan_search("cannabis survey", portal = "open.canada.ca") +#' head(hits[, c("dataset_title", "resource_id", "format")]) +#' } +#' @seealso \code{\link{morie_fetch_ckan}}, \code{\link{morie_fetch}} +#' @export +morie_ckan_search <- function(query, portal = "open.canada.ca", + rows = 25L, ...) { + if (!requireNamespace("jsonlite", quietly = TRUE)) { + stop("Package 'jsonlite' is required for morie_ckan_search().", + call. = FALSE + ) + } + base <- .morie_ckan_portal(portal) + api <- paste0(base, "/api/3/action/package_search") + url <- .morie_url_with_params( + api, c(list(q = query, rows = as.integer(rows)), list(...)) + ) + payload <- jsonlite::fromJSON(.morie_read_text(url), simplifyVector = FALSE) + if (!isTRUE(payload$success)) { + stop("CKAN package_search failed on portal ", base, call. = FALSE) + } + results <- payload$result$results + if (length(results) == 0L) { + return(data.frame( + dataset_title = character(0), dataset_id = character(0), + resource_id = character(0), resource_name = character(0), + format = character(0), datastore_active = logical(0), + url = character(0), stringsAsFactors = FALSE + )) + } + rows_out <- list() + for (ds in results) { + res <- ds$resources + if (length(res) == 0L) next + for (r in res) { + rows_out[[length(rows_out) + 1L]] <- data.frame( + dataset_title = .nz(ds$title, ds$name), + dataset_id = .nz(ds$id), + resource_id = .nz(r$id), + resource_name = .nz(r$name), + format = toupper(.nz(r$format)), + datastore_active = isTRUE(r$datastore_active), + url = .nz(r$url), + stringsAsFactors = FALSE + ) + } + } + do.call(rbind, rows_out) +} + +# Small helper: first non-empty scalar, else "". +.nz <- function(...) { + for (x in list(...)) { + if (!is.null(x) && length(x) >= 1L && !is.na(x[[1L]]) && + nzchar(as.character(x[[1L]]))) { + return(as.character(x[[1L]])) + } + } + "" +} + +# --- morie_fetch_arcgis ---------------------------------------------------- + +#' Query an ArcGIS FeatureServer / MapServer layer +#' +#' Pulls attribute records from an ArcGIS REST layer, paginating through +#' the server transfer limit automatically (ArcGIS caps a single query +#' at \code{maxRecordCount} features, typically 1000-2000). +#' +#' @param layer_url The layer URL, ending in \code{/FeatureServer/} +#' or \code{/MapServer/}. +#' @param where SQL-style WHERE filter (default \code{"1=1"}, all rows). +#' @param out_fields Comma-separated field list (default \code{"*"}). +#' @param params Optional named list of extra query parameters. +#' @param page_size Records requested per page (default 2000). +#' @param max_records Cap on the total number of records (default +#' \code{Inf} -- fetch the whole layer). +#' @return A data.frame of feature attributes (geometry is dropped). +#' @examples +#' \dontrun{ +#' layer <- paste0( +#' "https://services.arcgis.com/ORG/arcgis/rest/", +#' "services/Assault/FeatureServer/0" +#' ) +#' df <- morie_fetch_arcgis(layer) +#' } +#' @seealso \code{\link{morie_fetch}} +#' @export +morie_fetch_arcgis <- function(layer_url, where = "1=1", out_fields = "*", + params = NULL, page_size = 2000L, + max_records = Inf) { + if (!requireNamespace("jsonlite", quietly = TRUE)) { + stop("Package 'jsonlite' is required for morie_fetch_arcgis().", + call. = FALSE + ) + } + layer_url <- sub("/+$", "", layer_url) + query_url <- paste0(layer_url, "/query") + offset <- 0L + fetched <- 0L + pages <- list() + repeat { + this_page <- min(page_size, max_records - fetched) + if (this_page <= 0L) break + p <- c(list( + where = where, outFields = out_fields, + returnGeometry = "false", f = "json", + resultOffset = offset, + resultRecordCount = as.integer(this_page) + ), params) + payload <- jsonlite::fromJSON( + .morie_read_text(.morie_url_with_params(query_url, p)), + simplifyVector = TRUE + ) + if (!is.null(payload$error)) { + stop("ArcGIS query error: ", + .nz(payload$error$message, "unknown"), + call. = FALSE + ) + } + feats <- payload$features + attrs <- if (is.null(feats) || NROW(feats) == 0L) { + NULL + } else if (is.data.frame(feats) && !is.null(feats$attributes)) { + feats$attributes + } else { + feats + } + if (is.null(attrs) || NROW(attrs) == 0L) break + attrs <- as.data.frame(attrs, stringsAsFactors = FALSE) + pages[[length(pages) + 1L]] <- attrs + fetched <- fetched + NROW(attrs) + if (!isTRUE(payload$exceededTransferLimit) || fetched >= max_records) { + break + } + offset <- offset + NROW(attrs) + } + if (length(pages) == 0L) { + return(data.frame()) + } + if (length(pages) == 1L) { + return(pages[[1L]]) + } + do.call(rbind, pages) +} diff --git a/r-package/morie/R/database.R b/r-package/morie/R/database.R index 656bdd51df..394e9365cf 100644 --- a/r-package/morie/R/database.R +++ b/r-package/morie/R/database.R @@ -1,16 +1,141 @@ -# database.R -- DBI/RSQLite data layer for MORIE +# database.R -- DBI-backed generic-SQL data layer for MORIE # -# Built-in database: inst/extdata/morie.db ships with the package. -# User cache: morie.db under the per-user cache directory. -# Both R (DBI/RSQLite) and Python (sqlite3) share the same SQLite files. - -# Per-user cache directory -- portable, always user-writable, and the -# same location the Python package uses, so the SQLite cache is shared. -# Honours XDG_CACHE_HOME; otherwise ~/.cache/morie. -morie_cache_dir <- function() { - base <- Sys.getenv("XDG_CACHE_HOME", "") - if (!nzchar(base)) base <- file.path(path.expand("~"), ".cache") - file.path(base, "morie") +# Built-in database: inst/extdata/morie.db ships with the package and is +# always SQLite (read-only; portable across R and Python). +# +# User cache: any DBI-compatible backend. The default is SQLite at +# morie.db under the per-user cache directory. Users who want a server +# backend (PostgreSQL, MariaDB) or a columnar one (DuckDB) pass a +# pre-opened DBI connection via the `con` argument on every cache +# function. The same code path then talks to that backend through DBI. +# +# Examples: +# # default SQLite (current behaviour) +# morie_cache_store(df, "tbl") +# +# # DuckDB +# con <- DBI::dbConnect(duckdb::duckdb(), dbdir = "morie.duckdb") +# morie_cache_store(df, "tbl", con = con) +# +# # PostgreSQL +# con <- DBI::dbConnect(RPostgres::Postgres(), +# host = "localhost", dbname = "morie", user = "...") +# morie_load_dataset("ocp21", con = con) + +# Internal: resolve a DBI connection. Accepts a pre-opened connection +# (used as-is, caller owns disconnection) OR a SQLite path string (we +# open + own + close). The default path is the per-user cache. +# +# Returns: list(con = DBIConnection, close = logical). +.morie_db_handle <- function(con = NULL, db_path = NULL) { + if (!is.null(con)) { + if (!inherits(con, "DBIConnection")) { + stop("`con` must be a DBIConnection (see `?DBI::dbConnect`).", + call. = FALSE + ) + } + return(list(con = con, close = FALSE)) + } + list(con = morie_db_connect(db_path), close = TRUE) +} + +#' morie cache contract +#' +#' morie functions that persist artifacts to disk (e.g. +#' \code{morie_fetch_siu(cache_html = TRUE)}) default to a +#' \emph{session-scoped} subdirectory of \code{\link[base]{tempdir}()}, +#' which R automatically removes when the session ends. This is the +#' most conservative CRAN-Policy-compliant default: nothing morie +#' writes ever survives the R session unless the user explicitly +#' opts in. +#' +#' Users who want \emph{persistent} caching across sessions opt in by +#' passing the result of \code{morie_cache_dir(subdir)} as the +#' \code{cache_dir} argument, e.g.: +#' +#' \preformatted{ +#' morie_fetch_siu( +#' cache_dir = morie_cache_dir("siu"), +#' cache_html = TRUE +#' ) +#' } +#' +#' The persistent location is \code{tools::R_user_dir("morie", "cache")} +#' (R \eqn{\ge} 4.0), which on Linux defaults to +#' \code{~/.cache/R/morie/}, on macOS to +#' \code{~/Library/Caches/org.R-project.R/R/morie/}, and on Windows to +#' \code{\%LOCALAPPDATA\%/R/cache/R/morie/}. Users can override this +#' location by setting the \code{MORIE_CACHE_DIR} environment variable +#' before calling \code{morie_cache_dir()}. +#' +#' \strong{Active management.} CRAN Policy requires persistent caches +#' to be actively managed. Use \code{\link{morie_cache_clear}()} to +#' empty the persistent cache (or a subdirectory of it). Cached SIU +#' HTML is ~80-100 MB at full sweep, so clearing it occasionally is +#' usually unnecessary, but it is supported. +#' +#' @param subdir Optional subdirectory under the morie cache root +#' (e.g. \code{"siu"}, \code{"tps"}). If \code{NULL}, the cache +#' root itself is returned. +#' @return A file path string. The directory is \emph{not} created; +#' callers create it lazily only when they actually persist to disk. +#' @examples +#' # Persistent cache root (does not write anything to disk): +#' morie_cache_dir() +#' # Per-subsystem persistent path: +#' morie_cache_dir("siu") +#' @seealso \code{\link{morie_cache_clear}} +#' @export +morie_cache_dir <- function(subdir = NULL) { + override <- Sys.getenv("MORIE_CACHE_DIR", "") + base <- if (nzchar(override)) { + path.expand(override) + } else { + tools::R_user_dir("morie", which = "cache") + } + if (is.null(subdir)) base else file.path(base, subdir) +} + +#' Clear morie's persistent cache directory +#' +#' Removes files cached by morie under +#' \code{tools::R_user_dir("morie", "cache")} (or +#' \code{MORIE_CACHE_DIR} if set). morie's default behaviour writes +#' caches to a session-scoped \code{\link[base]{tempdir}()} +#' subdirectory, so this function only matters if you have explicitly +#' opted in to persistent caching by passing +#' \code{cache_dir = morie_cache_dir(...)} to any of the morie +#' fetchers. +#' +#' @param subdir Optional subdirectory under the morie cache root to +#' target (e.g. \code{"siu"}, \code{"tps"}). If \code{NULL}, removes +#' the entire morie persistent-cache root. +#' @param confirm If \code{TRUE} (default in interactive sessions), +#' prompts the user before deleting. Set \code{FALSE} in scripts / +#' batch use to skip the prompt. +#' @return Invisibly, the number of files removed. +#' @examples +#' \donttest{ +#' # Non-interactive: skip the confirmation prompt. +#' morie_cache_clear("siu", confirm = FALSE) +#' } +#' @seealso \code{\link{morie_cache_dir}} +#' @export +morie_cache_clear <- function(subdir = NULL, confirm = interactive()) { + path <- morie_cache_dir(subdir) + if (!dir.exists(path)) { + return(invisible(0L)) + } + if (isTRUE(confirm)) { + ans <- readline(sprintf("Delete %s ? [y/N] ", path)) + if (!tolower(trimws(ans)) %in% c("y", "yes")) { + message("Aborted.") + return(invisible(0L)) + } + } + n_files <- length(list.files(path, recursive = TRUE, full.names = TRUE)) + unlink(path, recursive = TRUE, force = TRUE) + invisible(n_files) } #' Get path to the built-in MORIE datasets database @@ -22,54 +147,109 @@ morie_cache_dir <- function() { #' #' @return File path string. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_builtin_db() #' @export morie_builtin_db <- function() { db <- system.file("extdata", "morie.db", package = "morie") - if (nzchar(db)) return(db) + if (nzchar(db)) { + return(db) + } # Source-checkout / dev fallback: the per-user cache copy. file.path(morie_cache_dir(), "morie.db") } -#' Connect to the MORIE SQLite cache database +#' Connect to the MORIE cache database +#' +#' Opens (or creates) the per-user cache database. The default backend +#' is **DuckDB** — zero-config like SQLite, but vectorised + columnar, +#' so it handles the multi-GB-scale open-data PUMFs (TPS, CPADS bulk) +#' that morie ingests without breaking down on analytical queries. For +#' back-compat, an existing SQLite cache at `morie.db` is reused; if +#' duckdb is unavailable, falls back to SQLite. #' -#' Opens (or creates) the shared cache at \code{morie.db} in the -#' per-user cache directory (\code{~/.cache/morie} or -#' \code{XDG_CACHE_HOME}). Both R (DBI/RSQLite) and Python (sqlite3) -#' read/write this same file. +#' For non-default backends (PostgreSQL, MariaDB, MS SQL Server, ...), +#' construct your own DBI connection and pass it as `con` to the +#' `morie_cache_*` and `morie_load_dataset` functions: #' -#' @param db_path Path to the SQLite file. Defaults to the -#' \code{MORIE_CACHE_DB} env var, else \code{morie.db} in the -#' per-user cache directory. +#' \preformatted{ +#' con <- DBI::dbConnect(RPostgres::Postgres(), +#' host = "...", dbname = "morie", user = "...", password = "...") +#' morie_load_dataset("ocp21", con = con) +#' } +#' +#' @param db_path Optional path to a DuckDB (\code{*.duckdb}) or SQLite +#' (\code{*.db}) file. Defaults to the \code{MORIE_CACHE_DB} env var, +#' else \code{morie.duckdb} / \code{morie.db} in the per-user cache +#' directory. #' @return A DBI connection object. #' @examples #' \donttest{ -#' if (requireNamespace("DBI", quietly = TRUE) && -#' requireNamespace("RSQLite", quietly = TRUE)) { -#' tmp <- tempfile(fileext = ".db") -#' con <- morie_db_connect(db_path = tmp) -#' DBI::dbListTables(con) -#' DBI::dbDisconnect(con) -#' file.remove(tmp) -#' } +#' # DuckDB (default when 'duckdb' is installed); pass a '.db' path for SQLite. +#' if (requireNamespace("duckdb", quietly = TRUE) && +#' requireNamespace("DBI", quietly = TRUE)) { +#' tmp <- tempfile(fileext = ".duckdb") +#' con <- morie_db_connect(db_path = tmp) +#' DBI::dbListTables(con) +#' DBI::dbDisconnect(con) +#' file.remove(tmp) +#' } #' } #' @export morie_db_connect <- function(db_path = NULL) { - if (!requireNamespace("DBI", quietly = TRUE) || - !requireNamespace("RSQLite", quietly = TRUE)) { - stop("Packages 'DBI' and 'RSQLite' are required. Install with:\n", - " install.packages(c('DBI', 'RSQLite'))", call. = FALSE) + if (!requireNamespace("DBI", quietly = TRUE)) { + stop("Package 'DBI' is required. install.packages('DBI')", + call. = FALSE + ) } + # CRAN Policy: by default never write under user HOME. When the + # caller doesn't supply a path and the MORIE_CACHE_DB env var is + # unset, default to a session-scoped subdirectory of tempdir(). R + # cleans this up when the session ends. Users opt in to persistent + # caching by passing `db_path = morie_cache_dir("morie.duckdb")` + # explicitly (or by setting the MORIE_CACHE_DB env var). + cache_dir <- file.path(tempdir(), "morie") + duckdb_default <- file.path(cache_dir, "morie.duckdb") + sqlite_default <- file.path(cache_dir, "morie.db") + if (is.null(db_path)) { db_path <- Sys.getenv("MORIE_CACHE_DB", "") if (!nzchar(db_path)) { - db_path <- file.path(morie_cache_dir(), "morie.db") + # Resolution: prefer an existing morie.duckdb; else reuse an + # existing morie.db (back-compat with the SQLite era); else + # create morie.duckdb if duckdb is available, otherwise morie.db. + if (file.exists(duckdb_default)) { + db_path <- duckdb_default + } else if (file.exists(sqlite_default)) { + db_path <- sqlite_default + } else if (requireNamespace("duckdb", quietly = TRUE)) { + db_path <- duckdb_default + } else { + db_path <- sqlite_default + } } } dir.create(dirname(db_path), recursive = TRUE, showWarnings = FALSE) + + # Dispatch on extension: .duckdb -> DuckDB; anything else -> SQLite. + is_duckdb <- grepl("\\.duckdb$", db_path, ignore.case = TRUE) + if (is_duckdb) { + if (!requireNamespace("duckdb", quietly = TRUE)) { + stop("DuckDB path requested but the 'duckdb' package isn't installed.\n", + " install.packages('duckdb') -- or pass db_path ending in '.db' ", + "for SQLite.", + call. = FALSE + ) + } + return(DBI::dbConnect(duckdb::duckdb(), dbdir = db_path)) + } + # SQLite fallback path. + if (!requireNamespace("RSQLite", quietly = TRUE)) { + stop("SQLite path requested but the 'RSQLite' package isn't installed.\n", + " install.packages('RSQLite') -- or install 'duckdb' and pass a ", + "'.duckdb' path.", + call. = FALSE + ) + } con <- DBI::dbConnect(RSQLite::SQLite(), dbname = db_path) DBI::dbExecute(con, "PRAGMA journal_mode=WAL") con @@ -80,61 +260,84 @@ morie_db_connect <- function(db_path = NULL) { #' Writes (or replaces) a table in the shared SQLite cache. #' #' @param data A data.frame to cache. -#' @param table_name Name of the SQLite table. -#' @param db_path Optional override for the database path. +#' @param table_name Name of the destination table. +#' @param db_path Optional path to a SQLite file (default backend). +#' @param con Optional pre-opened DBI connection. When supplied, the +#' table is written through `con` and `db_path` is ignored. Use this +#' for non-SQLite backends (PostgreSQL, DuckDB, MariaDB). #' @return Number of rows written (invisible). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") +#' \donttest{ +#' db <- tempfile(fileext = ".db") +#' morie_cache_store( +#' data = data.frame(x = rnorm(50), y = rnorm(50)), +#' table_name = "demo", +#' db_path = db +#' ) +#' file.remove(db) #' } #' @export -morie_cache_store <- function(data, table_name, db_path = NULL) { - con <- morie_db_connect(db_path) - on.exit(DBI::dbDisconnect(con), add = TRUE) - DBI::dbWriteTable(con, table_name, data, overwrite = TRUE) +morie_cache_store <- function(data, table_name, db_path = NULL, con = NULL) { + h <- .morie_db_handle(con, db_path) + if (h$close) on.exit(DBI::dbDisconnect(h$con), add = TRUE) + DBI::dbWriteTable(h$con, table_name, data, overwrite = TRUE) invisible(nrow(data)) } #' Load a table from the MORIE cache #' -#' @param table_name Name of the SQLite table. -#' @param db_path Optional override for the database path. +#' @param table_name Name of the table. +#' @param db_path Optional path to a SQLite file (default backend). +#' @param con Optional pre-opened DBI connection (overrides `db_path`). #' @return A data.frame, or \code{NULL} if the table does not exist. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") +#' \donttest{ +#' db <- tempfile(fileext = ".db") +#' morie_cache_store( +#' data = data.frame(x = 1:5), +#' table_name = "demo", +#' db_path = db +#' ) +#' morie_cache_load(table_name = "demo", db_path = db) +#' file.remove(db) #' } #' @export -morie_cache_load <- function(table_name, db_path = NULL) { - con <- morie_db_connect(db_path) - on.exit(DBI::dbDisconnect(con), add = TRUE) - if (!DBI::dbExistsTable(con, table_name)) { +morie_cache_load <- function(table_name, db_path = NULL, con = NULL) { + h <- .morie_db_handle(con, db_path) + if (h$close) on.exit(DBI::dbDisconnect(h$con), add = TRUE) + if (!DBI::dbExistsTable(h$con, table_name)) { return(NULL) } - DBI::dbReadTable(con, table_name) + DBI::dbReadTable(h$con, table_name) } #' List all tables in the MORIE cache #' -#' @param db_path Optional override for the database path. +#' @param db_path Optional path to a SQLite file (default backend). +#' @param con Optional pre-opened DBI connection (overrides `db_path`). #' @return A data.frame with columns \code{table} and \code{rows}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") +#' \donttest{ +#' db <- tempfile(fileext = ".db") +#' morie_cache_store(data.frame(x = 1:3), "demo", db_path = db) +#' morie_cache_list(db_path = db) +#' file.remove(db) #' } #' @export -morie_cache_list <- function(db_path = NULL) { - con <- morie_db_connect(db_path) - on.exit(DBI::dbDisconnect(con), add = TRUE) - tables <- DBI::dbListTables(con) +morie_cache_list <- function(db_path = NULL, con = NULL) { + h <- .morie_db_handle(con, db_path) + if (h$close) on.exit(DBI::dbDisconnect(h$con), add = TRUE) + tables <- DBI::dbListTables(h$con) if (length(tables) == 0L) { return(data.frame(table = character(), rows = integer())) } + # Quote identifiers per the backend's own conventions so this works on + # SQLite ([tbl]), PostgreSQL ("tbl"), MariaDB (`tbl`), DuckDB ("tbl"), ... + # COUNT(*) returns integer on SQLite/PostgreSQL but double on DuckDB; cast + # so the vapply FUN.VALUE matches across backends. counts <- vapply(tables, function(t) { - DBI::dbGetQuery(con, sprintf("SELECT COUNT(*) AS n FROM [%s]", t))$n + q <- DBI::dbQuoteIdentifier(h$con, t) + as.integer(DBI::dbGetQuery(h$con, sprintf("SELECT COUNT(*) AS n FROM %s", q))$n) }, integer(1)) data.frame(table = tables, rows = counts, stringsAsFactors = FALSE) } @@ -146,15 +349,17 @@ morie_cache_list <- function(db_path = NULL) { #' #' @param path Path to a CSV or RDS file. #' @param table_name Name for the cached table. -#' @param db_path Optional override for the database path. +#' @param db_path Optional path to a SQLite file (default backend). +#' @param con Optional pre-opened DBI connection (overrides `db_path`). #' @return Number of rows cached (invisible). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' tdir <- tempfile("morie-cache-") +#' dir.create(tdir) +#' f <- file.path(tdir, "demo.csv") +#' write.csv(data.frame(x = 1:3, y = 4:6), f, row.names = FALSE) +#' morie_cache_file(f, "demo", db_path = file.path(tdir, "cache.db")) #' @export -morie_cache_file <- function(path, table_name, db_path = NULL) { +morie_cache_file <- function(path, table_name, db_path = NULL, con = NULL) { ext <- tolower(tools::file_ext(path)) data <- if (ext == "rds") { readRDS(path) @@ -163,7 +368,7 @@ morie_cache_file <- function(path, table_name, db_path = NULL) { } else { stop("Unsupported format: ", ext, call. = FALSE) } - morie_cache_store(data, table_name, db_path) + morie_cache_store(data, table_name, db_path = db_path, con = con) } #' Load CPADS data: local files -> cache -> CKAN API @@ -175,19 +380,19 @@ morie_cache_file <- function(path, table_name, db_path = NULL) { #' \item CKAN API fetch (requires internet) #' } #' -#' @param db_path Optional override for the database path. +#' @param db_path Optional path to a SQLite/DuckDB file (default backend). #' @param use_ckan Logical; if TRUE and data not found locally or in cache, #' attempt to fetch from the CKAN API. +#' @param con Optional pre-opened DBI connection (overrides `db_path`). #' @return A data.frame with canonical CPADS columns. #' @examples #' \dontrun{ -#' # Needs the CPADS PUMF (local file, cache, or a live CKAN fetch), -#' # so it cannot run inside an offline R CMD check. -#' cpads <- morie_load_cpads(use_ckan = TRUE) -#' if (!is.null(cpads)) head(cpads) +#' # Needs the CPADS PUMF (local file, cache, or a live CKAN fetch). +#' cpads <- morie_load_cpads(use_ckan = TRUE) +#' if (!is.null(cpads)) head(cpads) #' } #' @export -morie_load_cpads <- function(db_path = NULL, use_ckan = TRUE) { +morie_load_cpads <- function(db_path = NULL, use_ckan = TRUE, con = NULL) { # 1. Local files. local_paths <- c( "data/datasets/oc/CPADS/2021-2022/cpads-2021-2022-pumf2.csv", @@ -198,14 +403,16 @@ morie_load_cpads <- function(db_path = NULL, use_ckan = TRUE) { message("Loading CPADS from local: ", p) ext <- tolower(tools::file_ext(p)) data <- if (ext == "rds") readRDS(p) else utils::read.csv(p, stringsAsFactors = FALSE) - # Cache it for next time. - tryCatch(morie_cache_store(data, "cpads_canonical", db_path), error = function(e) NULL) + tryCatch( + morie_cache_store(data, "cpads_canonical", db_path = db_path, con = con), + error = function(e) NULL + ) return(data) } } - # 2. SQLite cache. - cached <- morie_cache_load("cpads_canonical", db_path) + # 2. DBI cache (DuckDB by default; SQLite if older cache exists). + cached <- morie_cache_load("cpads_canonical", db_path = db_path, con = con) if (!is.null(cached)) { message("Loading CPADS from cache (", nrow(cached), " rows)") return(cached) @@ -214,7 +421,7 @@ morie_load_cpads <- function(db_path = NULL, use_ckan = TRUE) { # 3. CKAN API. if (use_ckan) { message("Fetching CPADS from CKAN API...") - data <- morie_fetch_ckan("cpads", db_path = db_path) + data <- morie_fetch_ckan("cpads", db_path = db_path, con = con) return(data) } @@ -224,19 +431,28 @@ morie_load_cpads <- function(db_path = NULL, use_ckan = TRUE) { #' Fetch data from the CKAN API and cache it #' #' @param dataset_key One of \code{"cpads"}, \code{"csads"}, \code{"csus"}. -#' @param limit Max records to fetch. +#' @param limit Maximum records to fetch. The CKAN datastore caps a +#' single request at 32000 rows, so larger resources are paged through +#' with `offset`; the default reads the entire resource. #' @param db_path Optional override for the database path. +#' @param resource_id Optional CKAN datastore resource id. When supplied +#' (e.g. from \code{morie_dataset_catalog()$ckan_resource_id}) it is used +#' directly, so any catalogued dataset can be fetched without a built-in +#' database; \code{dataset_key} then only labels the cache table. +#' @param con Optional pre-opened DBI connection (overrides `db_path`). #' @return A data.frame. #' @examples #' \dontrun{ -#' # Requires network access. Fetches the first 5000 rows of the -#' # Canadian Postsecondary Alcohol and Drug Use Survey from the -#' # Government of Canada CKAN datastore: -#' cpads <- morie_fetch_ckan(dataset_key = "cpads", limit = 5000L) -#' nrow(cpads) +#' # Requires network access. Fetches the first 5000 rows of the +#' # Canadian Postsecondary Alcohol and Drug Use Survey from the +#' # Government of Canada CKAN datastore: +#' cpads <- morie_fetch_ckan(dataset_key = "cpads", limit = 5000L) +#' nrow(cpads) #' } #' @export -morie_fetch_ckan <- function(dataset_key = "cpads", limit = 32000L, db_path = NULL) { +morie_fetch_ckan <- function(dataset_key = "cpads", limit = Inf, + db_path = NULL, resource_id = NULL, + con = NULL) { ckan_base <- "https://open.canada.ca/data/en/api/3/action/datastore_search" resource_ids <- list( @@ -251,11 +467,19 @@ morie_fetch_ckan <- function(dataset_key = "cpads", limit = 32000L, db_path = NU csus = "https://open.canada.ca/data/api/action/package_show?id=65e2d45e-efc6-4c29-9a9b-db59bc96aa0e" ) - rid <- resource_ids[[dataset_key]] - if (is.null(rid)) { + # A catalog-supplied resource id is used directly; otherwise fall back + # to the survey-keyed lookup, then to package-metadata resolution. + rid <- if (!is.null(resource_id) && nzchar(resource_id)) { + resource_id + } else { + resource_ids[[dataset_key]] + } + if (is.null(rid) || !nzchar(rid)) { # Resolve from package metadata. meta_url <- metadata_urls[[dataset_key]] - if (is.null(meta_url)) stop("Unknown dataset: ", dataset_key, call. = FALSE) + if (is.null(meta_url)) { + stop("Unknown dataset / no CKAN resource id: ", dataset_key, call. = FALSE) + } meta_raw <- readLines(url(meta_url), warn = FALSE) meta <- jsonlite::fromJSON(paste(meta_raw, collapse = "")) resources <- meta$result$resources @@ -263,13 +487,43 @@ morie_fetch_ckan <- function(dataset_key = "cpads", limit = 32000L, db_path = NU rid <- if (length(csv_idx) > 0) resources$id[csv_idx[1]] else resources$id[1] } - api_url <- sprintf("%s?resource_id=%s&limit=%d", ckan_base, rid, as.integer(limit)) - message("Fetching from: ", api_url) - raw <- readLines(url(api_url), warn = FALSE) - payload <- jsonlite::fromJSON(paste(raw, collapse = "")) - records <- payload$result$records + # CKAN datastore_search caps a single request at 32000 rows, so page + # through with `offset` until the whole resource (or `limit`) is read. + cap <- as.integer(min(limit, .Machine$integer.max)) + page <- min(cap, 32000L) + message("Fetching from CKAN datastore: resource_id=", rid) + pages <- list() + fetched <- 0L + total <- NA_real_ + repeat { + api_url <- sprintf( + "%s?resource_id=%s&limit=%d&offset=%d", + ckan_base, rid, page, fetched + ) + raw <- readLines(url(api_url), warn = FALSE) + payload <- jsonlite::fromJSON(paste(raw, collapse = "")) + recs <- payload$result$records + if (is.null(recs) || NROW(recs) == 0L) break + pages[[length(pages) + 1L]] <- recs + fetched <- fetched + NROW(recs) + if (is.na(total)) { + total <- if (!is.null(payload$result$total)) { + as.numeric(payload$result$total) + } else { + fetched + } + } + if (fetched >= total || fetched >= cap) break + } + records <- if (length(pages) == 0L) { + NULL + } else if (length(pages) == 1L) { + pages[[1L]] + } else { + do.call(rbind, pages) + } - if (is.null(records) || nrow(records) == 0L) { + if (is.null(records) || NROW(records) == 0L) { stop("CKAN returned 0 records for ", dataset_key, call. = FALSE) } @@ -278,9 +532,12 @@ morie_fetch_ckan <- function(dataset_key = "cpads", limit = 32000L, db_path = NU # Cache. table_name <- paste0(dataset_key, "_raw") - tryCatch(morie_cache_store(records, table_name, db_path), error = function(e) { - message("Warning: could not cache: ", conditionMessage(e)) - }) + tryCatch( + morie_cache_store(records, table_name, db_path = db_path, con = con), + error = function(e) { + message("Warning: could not cache: ", conditionMessage(e)) + } + ) records } @@ -295,67 +552,100 @@ morie_fetch_ckan <- function(dataset_key = "cpads", limit = 32000L, db_path = NU key_lower <- tolower(gsub("-", "_", key)) # Exact match on new short keys. idx <- which(catalog$key == key_lower) - if (length(idx) == 1L) return(catalog$key[idx]) + if (length(idx) == 1L) { + return(catalog$key[idx]) + } # Backward-compat: resolve old long keys to new short keys. if (key_lower %in% names(.OLD_TO_SHORT)) { short <- .OLD_TO_SHORT[[key_lower]] idx <- which(catalog$key == short) - if (length(idx) == 1L) return(catalog$key[idx]) + if (length(idx) == 1L) { + return(catalog$key[idx]) + } } # Substring match on keys. idx <- which(grepl(key_lower, catalog$key, fixed = TRUE)) - if (length(idx) >= 1L) return(catalog$key[idx[1L]]) + if (length(idx) >= 1L) { + return(catalog$key[idx[1L]]) + } # Substring match on dataset names. idx <- which(grepl(key_lower, tolower(catalog$name), fixed = TRUE)) - if (length(idx) >= 1L) return(catalog$key[idx[1L]]) + if (length(idx) >= 1L) { + return(catalog$key[idx[1L]]) + } NULL } #' Load a dataset by catalog key #' -#' Resolution: SQLite cache -> local file ingest -> CKAN API -> error. -#' Supports fuzzy matching: \code{morie_load_dataset("cpads_2021")} resolves -#' to \code{oc_cpads_2021}. +#' Resolution tiers, tried in order: built-in DB -> user cache -> local +#' file -> CKAN datastore -> direct download URL -> ArcGIS layer -> +#' error. Supports fuzzy matching: \code{morie_load_dataset("cpads_2021")} +#' resolves to \code{ocp21}. #' #' @param key Dataset catalog key (or fuzzy match). -#' @param db_path Optional override for the database path. +#' @param db_path Optional path to a SQLite/DuckDB file (default backend). +#' @param refresh If \code{TRUE}, bypass the built-in database and the +#' user cache (and, for remotely-backed datasets, the local file) and +#' re-fetch from the remote source, overwriting the cached copy. Use +#' this to pick up time-to-time updates to a dataset. +#' @param con Optional pre-opened DBI connection for the user cache +#' (overrides `db_path`). The built-in DB read is always SQLite-based +#' and is unaffected by `con`. #' @return A data.frame. #' @examples #' \dontrun{ -#' df <- morie_load_dataset("ocp21") # CPADS 2021-2022 -#' nrow(df) +#' df <- morie_load_dataset("ocp21") # CPADS 2021-2022 (default DuckDB cache) +#' df <- morie_load_dataset("ocp21", refresh = TRUE) # force re-fetch +#' +#' # PostgreSQL cache (run a server first): +#' # con <- DBI::dbConnect(RPostgres::Postgres(), +#' # host = "localhost", dbname = "morie", user = "...") +#' # df <- morie_load_dataset("ocp21", con = con) #' } +#' @seealso \code{\link{morie_fetch}}, \code{\link{morie_ckan_search}} #' @export -morie_load_dataset <- function(key, db_path = NULL) { +morie_load_dataset <- function(key, db_path = NULL, refresh = FALSE, + con = NULL) { matched <- .fuzzy_match_key(key) if (is.null(matched)) { stop("Unknown dataset key: '", key, "'. See morie_dataset_catalog().", call. = FALSE) } catalog <- morie_dataset_catalog() entry <- catalog[catalog$key == matched, ] - - # 1. Built-in database (ships with package). - builtin_path <- tryCatch(morie_builtin_db(), error = function(e) NULL) - if (!is.null(builtin_path) && requireNamespace("DBI", quietly = TRUE) && + has <- function(col) col %in% names(entry) && nzchar(entry[[col]]) + has_remote <- has("ckan_resource_id") || has("download_url") || + has("arcgis_url") + + if (!refresh) { + # 1. Built-in database (ships with package). + builtin_path <- tryCatch(morie_builtin_db(), error = function(e) NULL) + if (!is.null(builtin_path) && requireNamespace("DBI", quietly = TRUE) && requireNamespace("RSQLite", quietly = TRUE)) { - bcon <- DBI::dbConnect(RSQLite::SQLite(), dbname = builtin_path) - on.exit(DBI::dbDisconnect(bcon), add = TRUE) - if (DBI::dbExistsTable(bcon, entry$table_name)) { - data <- DBI::dbReadTable(bcon, entry$table_name) - message("Loaded ", matched, " from built-in DB (", nrow(data), " rows)") - return(data) + bcon <- DBI::dbConnect(RSQLite::SQLite(), dbname = builtin_path) + on.exit(DBI::dbDisconnect(bcon), add = TRUE) + if (DBI::dbExistsTable(bcon, entry$table_name)) { + data <- DBI::dbReadTable(bcon, entry$table_name) + message( + "Loaded ", matched, " from built-in DB (", nrow(data), + " rows)" + ) + return(data) + } } - } - # 2. User cache. - cached <- morie_cache_load(entry$table_name, db_path) - if (!is.null(cached)) { - message("Loaded ", matched, " from cache (", nrow(cached), " rows)") - return(cached) + # 2. User cache (DuckDB by default; SQLite if older cache exists). + cached <- morie_cache_load(entry$table_name, db_path = db_path, con = con) + if (!is.null(cached)) { + message("Loaded ", matched, " from cache (", nrow(cached), " rows)") + return(cached) + } } - # 2. Local file. - if (file.exists(entry$local_path)) { + # 3. Local file. Skipped on refresh when a remote source exists, so a + # refresh re-pulls from the authoritative remote rather than a stale + # on-disk copy; for local-only datasets the file remains the source. + if (file.exists(entry$local_path) && !(refresh && has_remote)) { message("Ingesting ", matched, " from local: ", entry$local_path) ext <- tolower(tools::file_ext(entry$local_path)) data <- if (ext == "csv") { @@ -368,38 +658,71 @@ morie_load_dataset <- function(key, db_path = NULL) { } else { stop("Unsupported format: ", ext, call. = FALSE) } - morie_cache_store(data, entry$table_name, db_path) + morie_cache_store(data, entry$table_name, db_path = db_path, con = con) return(data) } - # 3. CKAN API. - if (nzchar(entry$ckan_resource_id)) { - message("Fetching ", matched, " from CKAN API...") - data <- morie_fetch_ckan(entry$survey, db_path = db_path) + # 4. CKAN datastore -- resolved directly from the catalog resource id, + # matching the Python load_dataset() design (no built-in DB needed). + if (has("ckan_resource_id")) { + message("Fetching ", matched, " from the CKAN datastore ...") + data <- morie_fetch_ckan( + dataset_key = matched, + resource_id = entry$ckan_resource_id, + db_path = db_path, + con = con + ) + morie_cache_store(data, entry$table_name, db_path = db_path, con = con) + return(data) + } + + # 5. Direct download URL -- open-data files not exposed through the CKAN + # datastore (direct CSV/XLSX, or a file bundled inside a .zip archive). + if (has("download_url")) { + message("Downloading ", matched, " from ", entry$download_url, " ...") + zm <- if ("zip_member" %in% names(entry)) entry$zip_member else "" + is_zip <- grepl("\\.zip$", entry$download_url, ignore.case = TRUE) + data <- morie_fetch(entry$download_url, + format = if (is_zip) "zip" else "auto", + zip_member = zm + ) + morie_cache_store(data, entry$table_name, db_path = db_path, con = con) return(data) } - stop("Dataset '", matched, "' not found locally, in cache, or via CKAN.\n", - "Run: Rscript data-raw/ingest_datasets.R --only ", matched, call. = FALSE) + # 6. ArcGIS FeatureServer / MapServer layer (e.g. TPS crime open data). + if (has("arcgis_url")) { + message("Querying ", matched, " from the ArcGIS layer ...") + data <- morie_fetch_arcgis(entry$arcgis_url) + morie_cache_store(data, entry$table_name, db_path = db_path, con = con) + return(data) + } + + stop("Dataset '", matched, "' not found locally, in cache, via CKAN, ", + "via a direct download URL, or via an ArcGIS layer.\n", + "Run: Rscript data-raw/ingest_datasets.R --only ", matched, + call. = FALSE + ) } #' List all datasets with cache status #' -#' @param db_path Optional override for the database path. +#' @param db_path Optional path to a SQLite/DuckDB file (default backend). +#' @param con Optional pre-opened DBI connection (overrides `db_path`). #' @return A data.frame with columns: key, name, source, survey, year, type, #' cached (logical), rows (integer or NA). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_list_datasets() #' @export -morie_list_datasets <- function(db_path = NULL) { +morie_list_datasets <- function(db_path = NULL, con = NULL) { catalog <- morie_dataset_catalog() - cached_tables <- tryCatch({ - cl <- morie_cache_list(db_path) - stats::setNames(cl$rows, cl$table) - }, error = function(e) stats::setNames(integer(0), character(0))) + cached_tables <- tryCatch( + { + cl <- morie_cache_list(db_path = db_path, con = con) + stats::setNames(cl$rows, cl$table) + }, + error = function(e) stats::setNames(integer(0), character(0)) + ) catalog$cached <- catalog$table_name %in% names(cached_tables) catalog$rows <- as.integer(cached_tables[catalog$table_name]) @@ -413,7 +736,8 @@ morie_list_datasets <- function(db_path = NULL) { #' @examples #' # Use a real catalog key (run `morie_dataset_catalog()$key` to list them): #' info <- morie_dataset_info("ocp21") -#' info$source; info$year +#' info$source +#' info$year #' # Fuzzy match works for partial / forgiving keys: #' morie_dataset_info("cpads")$key #' @export @@ -434,10 +758,7 @@ morie_dataset_info <- function(key) { #' If \code{NULL}, lists all available userguides. #' @return File path string, or character vector of filenames. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_userguide() #' @export morie_userguide <- function(name = NULL) { if (is.null(name)) { @@ -456,20 +777,24 @@ morie_userguide <- function(name = NULL) { #' @param survey One of \code{"csads_2021"}, \code{"csads_2023"}, #' \code{"csus_2019"}, \code{"csus_2023"}, or \code{"all"} (default). #' @param limit Max records per CKAN request (default 32000). -#' @param db_path Optional override for cache database path. +#' @param db_path Optional path to a SQLite/DuckDB file (default backend). +#' @param con Optional pre-opened DBI connection (overrides `db_path`). #' @return Invisibly, the number of CSV files successfully downloaded. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") +#' \donttest{ +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' } #' @export -morie_download_bootstrap <- function(survey = "all", limit = 32000L, db_path = NULL) { +morie_download_bootstrap <- function(survey = "all", limit = 32000L, + db_path = NULL, con = NULL) { + # Current short catalog keys (see morie_dataset_catalog()); the older + # oc___bootstrap long keys are no longer in the catalog. bootstrap_keys <- list( - csads_2021 = "oc_csads_2021_bootstrap", - csads_2023 = "oc_csads_2023_bootstrap", - csus_2019 = "oc_csus_2019_bootstrap", - csus_2023 = "oc_csus_2023_bootstrap" + csads_2021 = "ocs22bt", + csads_2023 = "ocs24bt", + csus_2019 = "cu20bt", + csus_2023 = "cu23bt" ) if (survey == "all") { targets <- unlist(bootstrap_keys, use.names = FALSE) @@ -489,7 +814,7 @@ morie_download_bootstrap <- function(survey = "all", limit = 32000L, db_path = N # Try local file first. if (file.exists(entry$local_path)) { message("Ingesting ", key, " from local file: ", entry$local_path) - morie_cache_file(entry$local_path, entry$table_name, db_path) + morie_cache_file(entry$local_path, entry$table_name, db_path = db_path, con = con) message(" OK: cached ", entry$table_name) next } @@ -497,13 +822,19 @@ morie_download_bootstrap <- function(survey = "all", limit = 32000L, db_path = N # Try CKAN. if (nzchar(entry$ckan_resource_id)) { message("Downloading ", key, " from CKAN (limit=", limit, ")...") - tryCatch({ - data <- morie_fetch_ckan(entry$survey, limit = limit, db_path = db_path) - morie_cache_store(data, entry$table_name, db_path) - message(" OK: ", nrow(data), " rows cached as ", entry$table_name) - }, error = function(e) { - message(" ERROR: ", conditionMessage(e)) - }) + tryCatch( + { + data <- morie_fetch_ckan(entry$survey, + limit = limit, + db_path = db_path, con = con + ) + morie_cache_store(data, entry$table_name, db_path = db_path, con = con) + message(" OK: ", nrow(data), " rows cached as ", entry$table_name) + }, + error = function(e) { + message(" ERROR: ", conditionMessage(e)) + } + ) } else { message(" ", key, ": no CKAN resource ID. Download CSV manually to ", entry$local_path) } diff --git a/r-package/morie/R/dataset_catalog.R b/r-package/morie/R/dataset_catalog.R index d76fd1131e..e1e46e2f79 100644 --- a/r-package/morie/R/dataset_catalog.R +++ b/r-package/morie/R/dataset_catalog.R @@ -17,257 +17,372 @@ #' Keys match the Python DATASET_CATALOG in \code{data.py} exactly. #' Use \code{\link{morie_load_dataset}} to load by key. #' -#' @return A data.frame with 36 rows (one per dataset) and columns: +#' @return A data.frame with 44 rows (one per dataset) and columns: #' key, name, source, survey, year, format, type, large_file, -#' local_path, table_name, ckan_resource_id. +#' local_path, table_name, ckan_resource_id, download_url, zip_member. +#' The \code{download_url} / \code{zip_member} columns are empty for +#' datasets reachable through the SQLite cache or the CKAN datastore. #' @examples #' cat <- morie_dataset_catalog() #' nrow(cat) #' head(cat[, c("key", "name", "source", "year")]) #' # Find Ontario carceral datasets: -#' cat[grepl("OTIS|Ontario", paste(cat$source, cat$survey)), -#' c("key", "year")] +#' cat[ +#' grepl("OTIS|Ontario", paste(cat$source, cat$survey)), +#' c("key", "year") +#' ] #' @export morie_dataset_catalog <- function() { entries <- list( # -- OpenCanada (oc) PUMF microdata -- - list(key = "ocp21", name = "CPADS 2021-2022 PUMF", - source = "oc", survey = "cpads", year = "2021-2022", - format = "csv", type = "pumf", large_file = FALSE, - local_path = "data/datasets/oc/CPADS/2021-2022/cpads-2021-2022-pumf2.csv", - table_name = "ocp21", - ckan_resource_id = "d2639429-c304-45a6-90b3-770562f4d46d"), - list(key = "occ22", name = "CCS 2018-2022 PUMF", - source = "oc", survey = "ccs", year = "2018-2022", - format = "csv", type = "pumf", large_file = FALSE, - local_path = "data/datasets/oc/CCS/2018-2022/ccs_pumf_2018to2022_final.csv", - table_name = "occ22", ckan_resource_id = ""), - list(key = "occ23", name = "CCS 2023 PUMF", - source = "oc", survey = "ccs", year = "2023", - format = "csv", type = "pumf", large_file = FALSE, - local_path = "data/datasets/oc/CCS/2023/ccs_2023_pumf.csv", - table_name = "occ23", ckan_resource_id = ""), - list(key = "occ24", name = "CCS 2024 PUMF", - source = "oc", survey = "ccs", year = "2024", - format = "csv", type = "pumf", large_file = FALSE, - local_path = "data/datasets/oc/CCS/2024/ccs_pumf_2024-002.csv", - table_name = "occ24", ckan_resource_id = ""), - list(key = "ocs22mf", name = "CSADS 2021-2022 PUMF", - source = "oc", survey = "csads", year = "2021-2022", - format = "csv", type = "pumf", large_file = FALSE, - local_path = "data/datasets/oc/CSADS/2021-2022/csads202122pumf.csv", - table_name = "ocs22mf", - ckan_resource_id = "f6761337-47e9-455a-a3c4-ea8516aa634f"), - list(key = "ocs22bt", name = "CSADS 2021-2022 Bootstrap", - source = "oc", survey = "csads", year = "2021-2022", - format = "csv", type = "bootstrap", large_file = TRUE, - local_path = "data/datasets/oc/CSADS/2021-2022/csads202122bootstrap.csv", - table_name = "ocs22bt", - ckan_resource_id = "ebdc36e1-910d-4685-81a3-6acfe44729bc"), - list(key = "ocs24mf", name = "CSADS 2023-2024 PUMF", - source = "oc", survey = "csads", year = "2023-2024", - format = "csv", type = "pumf", large_file = FALSE, - local_path = "data/datasets/oc/CSADS/2023-2024/csads202324pumf.csv", - table_name = "ocs24mf", - ckan_resource_id = "81a3adf0-61d0-4691-afba-588fa5f563da"), - list(key = "ocs24bt", name = "CSADS 2023-2024 Bootstrap", - source = "oc", survey = "csads", year = "2023-2024", - format = "csv", type = "bootstrap", large_file = TRUE, - local_path = "data/datasets/oc/CSADS/2023-2024/csads202324bootstrap.csv", - table_name = "ocs24bt", ckan_resource_id = ""), - list(key = "cu20mf", name = "CSUS 2019-2020 PUMF", - source = "oc", survey = "csus", year = "2019-2020", - format = "csv", type = "pumf", large_file = FALSE, - local_path = "data/datasets/oc/CSUS/2019-2020/CADS201920pumf.csv", - table_name = "cu20mf", ckan_resource_id = ""), - list(key = "cu20bt", name = "CSUS 2019-2020 Bootstrap", - source = "oc", survey = "csus", year = "2019-2020", - format = "csv", type = "bootstrap", large_file = TRUE, - local_path = "data/datasets/oc/CSUS/2019-2020/CADS201920bsw.csv", - table_name = "cu20bt", ckan_resource_id = ""), - list(key = "cu23mf", name = "CSUS 2023 PUMF", - source = "oc", survey = "csus", year = "2023", - format = "csv", type = "pumf", large_file = FALSE, - local_path = "data/datasets/oc/CSUS/2023/csus2023_pumf_final.csv", - table_name = "cu23mf", ckan_resource_id = ""), - list(key = "cu23bt", name = "CSUS 2023 Bootstrap", - source = "oc", survey = "csus", year = "2023", - format = "csv", type = "bootstrap", large_file = TRUE, - local_path = "data/datasets/oc/CSUS/2023/csus2023_pumf_bwt.csv", - table_name = "cu23bt", ckan_resource_id = ""), + list( + key = "ocp21", name = "CPADS 2021-2022 PUMF", + source = "oc", survey = "cpads", year = "2021-2022", + format = "csv", type = "pumf", large_file = FALSE, + local_path = "data/datasets/oc/CPADS/2021-2022/cpads-2021-2022-pumf2.csv", + table_name = "ocp21", + ckan_resource_id = "d2639429-c304-45a6-90b3-770562f4d46d" + ), + list( + key = "occ22", name = "CCS 2018-2022 PUMF", + source = "oc", survey = "ccs", year = "2018-2022", + format = "csv", type = "pumf", large_file = FALSE, + local_path = "data/datasets/oc/CCS/2018-2022/ccs_pumf_2018to2022_final.csv", + table_name = "occ22", ckan_resource_id = "262e6163-ba41-4562-bd2b-8996e738b1d4" + ), + list( + key = "occ23", name = "CCS 2023 PUMF", + source = "oc", survey = "ccs", year = "2023", + format = "csv", type = "pumf", large_file = FALSE, + local_path = "data/datasets/oc/CCS/2023/ccs_2023_pumf.csv", + table_name = "occ23", ckan_resource_id = "100c5845-664e-4c66-be15-3625ce236d8b" + ), + list( + key = "occ24", name = "CCS 2024 PUMF", + source = "oc", survey = "ccs", year = "2024", + format = "csv", type = "pumf", large_file = FALSE, + local_path = "data/datasets/oc/CCS/2024/ccs_pumf_2024-002.csv", + table_name = "occ24", ckan_resource_id = "420925be-399a-473b-8be3-26875a1c132a" + ), + list( + key = "ocs22mf", name = "CSADS 2021-2022 PUMF", + source = "oc", survey = "csads", year = "2021-2022", + format = "csv", type = "pumf", large_file = FALSE, + local_path = "data/datasets/oc/CSADS/2021-2022/csads202122pumf.csv", + table_name = "ocs22mf", + ckan_resource_id = "f6761337-47e9-455a-a3c4-ea8516aa634f" + ), + list( + key = "ocs22bt", name = "CSADS 2021-2022 Bootstrap", + source = "oc", survey = "csads", year = "2021-2022", + format = "csv", type = "bootstrap", large_file = TRUE, + local_path = "data/datasets/oc/CSADS/2021-2022/csads202122bootstrap.csv", + table_name = "ocs22bt", + ckan_resource_id = "ebdc36e1-910d-4685-81a3-6acfe44729bc" + ), + list( + key = "ocs24mf", name = "CSADS 2023-2024 PUMF", + source = "oc", survey = "csads", year = "2023-2024", + format = "csv", type = "pumf", large_file = FALSE, + local_path = "data/datasets/oc/CSADS/2023-2024/csads202324pumf.csv", + table_name = "ocs24mf", + ckan_resource_id = "81a3adf0-61d0-4691-afba-588fa5f563da" + ), + list( + key = "ocs24bt", name = "CSADS 2023-2024 Bootstrap", + source = "oc", survey = "csads", year = "2023-2024", + format = "csv", type = "bootstrap", large_file = TRUE, + local_path = "data/datasets/oc/CSADS/2023-2024/csads202324bootstrap.csv", + table_name = "ocs24bt", ckan_resource_id = "", download_url = "https://open.canada.ca/data/dataset/1f15ca45-8bfd-4f9c-9ec6-2c0c440e69c2/resource/58682536-1325-405a-83f0-7b1284b4f717/download/202324-csads-ecade-pumf-fmgd-bootstrap-weights-poids-de-bootstrap.csv" + ), + list( + key = "cu20mf", name = "CSUS 2019-2020 PUMF", + source = "oc", survey = "csus", year = "2019-2020", + format = "csv", type = "pumf", large_file = FALSE, + local_path = "data/datasets/oc/CSUS/2019-2020/CADS201920pumf.csv", + table_name = "cu20mf", ckan_resource_id = "", download_url = "https://www150.statcan.gc.ca/n1/pub/13-25-0005/2021001/CSV.zip", zip_member = "CADS201920pumf.csv" + ), + list( + key = "cu20bt", name = "CSUS 2019-2020 Bootstrap", + source = "oc", survey = "csus", year = "2019-2020", + format = "csv", type = "bootstrap", large_file = TRUE, + local_path = "data/datasets/oc/CSUS/2019-2020/CADS201920bsw.csv", + table_name = "cu20bt", ckan_resource_id = "", download_url = "https://www150.statcan.gc.ca/n1/pub/13-25-0005/2021001/CSV.zip", zip_member = "CADS201920bsw.csv" + ), + list( + key = "cu23mf", name = "CSUS 2023 PUMF", + source = "oc", survey = "csus", year = "2023", + format = "csv", type = "pumf", large_file = FALSE, + local_path = "data/datasets/oc/CSUS/2023/csus2023_pumf_final.csv", + table_name = "cu23mf", ckan_resource_id = "c2c1795b-4501-49ba-9dd1-5b8360cc3b2e" + ), + list( + key = "cu23bt", name = "CSUS 2023 Bootstrap", + source = "oc", survey = "csus", year = "2023", + format = "csv", type = "bootstrap", large_file = TRUE, + local_path = "data/datasets/oc/CSUS/2023/csus2023_pumf_bwt.csv", + table_name = "cu23bt", ckan_resource_id = "", download_url = "https://open.canada.ca/data/dataset/65e2d45e-efc6-4c29-9a9b-db59bc96aa0e/resource/7d19d47a-5f42-4447-b735-aa4d677ad5ed/download/csus2023_pumf_bwt.csv" + ), # -- HealthInfobase (hib) aggregates -- - list(key = "hibp", name = "CPADS Aggregate", - source = "hib", survey = "cpads", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CPADS/CPADS.csv", - table_name = "hibp", ckan_resource_id = ""), - list(key = "hibsa", name = "CSADS Provinces", - source = "hib", survey = "csads", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSADS/provinces.csv", - table_name = "hibsa", ckan_resource_id = ""), - list(key = "hibsb", name = "CSADS Trends", - source = "hib", survey = "csads", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSADS/trends.csv", - table_name = "hibsb", ckan_resource_id = ""), - list(key = "hibua", name = "CSUS Alcohol", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Alcohol.csv", - table_name = "hibua", ckan_resource_id = ""), - list(key = "hibub", name = "CSUS Cannabis", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Cannabis.csv", - table_name = "hibub", ckan_resource_id = ""), - list(key = "hibuc", name = "CSUS Smoking & Vaping", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Cigarette smoking and vaping.csv", - table_name = "hibuc", ckan_resource_id = ""), - list(key = "hibud", name = "CSUS Illegal Substances", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Illegal substances.csv", - table_name = "hibud", ckan_resource_id = ""), - list(key = "hibue", name = "CSUS Opioids", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Opioids.csv", - table_name = "hibue", ckan_resource_id = ""), - list(key = "hibuf", name = "CSUS OTC Products", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Over the counter products.csv", - table_name = "hibuf", ckan_resource_id = ""), - list(key = "hibug", name = "CSUS Polysubstance", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Polysubstance.csv", - table_name = "hibug", ckan_resource_id = ""), - list(key = "hibuh", name = "CSUS Sedatives", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Sedatives.csv", - table_name = "hibuh", ckan_resource_id = ""), - list(key = "hibui", name = "CSUS Stimulants", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Stimulants.csv", - table_name = "hibui", ckan_resource_id = ""), - list(key = "hibuj", name = "CSUS Substance Use Harms", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Substance use harms.csv", - table_name = "hibuj", ckan_resource_id = ""), - list(key = "hibuk", name = "CSUS Treatment", - source = "hib", survey = "csus", year = "", - format = "csv", type = "aggregate", large_file = FALSE, - local_path = "data/datasets/hib/CSUS/Treatment.csv", - table_name = "hibuk", ckan_resource_id = ""), + list( + key = "hibp", name = "CPADS Aggregate", + source = "hib", survey = "cpads", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CPADS/CPADS.csv", + table_name = "hibp", ckan_resource_id = "" + ), + list( + key = "hibsa", name = "CSADS Provinces", + source = "hib", survey = "csads", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSADS/provinces.csv", + table_name = "hibsa", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csads/downloadable/CSADS-data.zip", zip_member = "provinces.csv" + ), + list( + key = "hibsb", name = "CSADS Trends", + source = "hib", survey = "csads", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSADS/trends.csv", + table_name = "hibsb", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csads/downloadable/CSADS-data.zip", zip_member = "trends.csv" + ), + list( + key = "hibua", name = "CSUS Alcohol", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Alcohol.csv", + table_name = "hibua", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Alcohol.csv" + ), + list( + key = "hibub", name = "CSUS Cannabis", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Cannabis.csv", + table_name = "hibub", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Cannabis.csv" + ), + list( + key = "hibuc", name = "CSUS Smoking & Vaping", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Cigarette smoking and vaping.csv", + table_name = "hibuc", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Cigarette smoking and vaping.csv" + ), + list( + key = "hibud", name = "CSUS Illegal Substances", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Illegal substances.csv", + table_name = "hibud", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Illegal substances.csv" + ), + list( + key = "hibue", name = "CSUS Opioids", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Opioids.csv", + table_name = "hibue", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Opioids.csv" + ), + list( + key = "hibuf", name = "CSUS OTC Products", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Over the counter products.csv", + table_name = "hibuf", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Over the counter products.csv" + ), + list( + key = "hibug", name = "CSUS Polysubstance", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Polysubstance.csv", + table_name = "hibug", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Polysubstance.csv" + ), + list( + key = "hibuh", name = "CSUS Sedatives", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Sedatives.csv", + table_name = "hibuh", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Sedatives.csv" + ), + list( + key = "hibui", name = "CSUS Stimulants", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Stimulants.csv", + table_name = "hibui", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Stimulants.csv" + ), + list( + key = "hibuj", name = "CSUS Substance Use Harms", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Substance use harms.csv", + table_name = "hibuj", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Substance use harms.csv" + ), + list( + key = "hibuk", name = "CSUS Treatment", + source = "hib", survey = "csus", year = "", + format = "csv", type = "aggregate", large_file = FALSE, + local_path = "data/datasets/hib/CSUS/Treatment.csv", + table_name = "hibuk", ckan_resource_id = "", download_url = "https://health-infobase.canada.ca/src/data/csus/CADS_data.zip", zip_member = "Treatment.csv" + ), # -- CIHI (cihi) indicator library -- - list(key = "cihidt", name = "CIHI All Indicators", - source = "cihi", survey = "indicators", year = "", - format = "xlsx", type = "indicator", large_file = FALSE, - local_path = "data/datasets/cihi/indicator-library-all-indicator-data-en.xlsx", - table_name = "cihidt", ckan_resource_id = ""), - list(key = "cihi820a", name = "CIHI 820: Substance Use Harm", - source = "cihi", survey = "indicators", year = "", - format = "xlsx", type = "indicator", large_file = FALSE, - local_path = "data/datasets/cihi/820/820-hospital-stays-for-harm-caused-by-substance-use-data-table-en.xlsx", - table_name = "cihi820a", ckan_resource_id = ""), - list(key = "cihi820b", name = "CIHI 820: Substance Use Breakdown 2024-2025", - source = "cihi", survey = "indicators", year = "2024-2025", - format = "xlsx", type = "indicator", large_file = FALSE, - local_path = "data/datasets/cihi/820/820-hospital-stays-harm-due-to-substance-use-breakdown-2024-2025-data-tables-en-additional.xlsx", - table_name = "cihi820b", ckan_resource_id = ""), - list(key = "cihi849", name = "CIHI 849: Alcohol Use Harm", - source = "cihi", survey = "indicators", year = "", - format = "xlsx", type = "indicator", large_file = FALSE, - local_path = "data/datasets/cihi/849/849-hospital-stays-for-harm-caused-by-alcohol-use-data-table-en.xlsx", - table_name = "cihi849", ckan_resource_id = ""), - list(key = "cihi885a", name = "CIHI 885: Youth Services", - source = "cihi", survey = "indicators", year = "", - format = "xlsx", type = "indicator", large_file = FALSE, - local_path = "data/datasets/cihi/885/885-youth-age-12-to-25-who-accessed-integrated-youth-services-for-mental-health-substance-use-and-well-being-support-data-table-en.xlsx", - table_name = "cihi885a", ckan_resource_id = ""), - list(key = "cihi885b", name = "CIHI 885: Youth Sites 2024-2025", - source = "cihi", survey = "indicators", year = "2024-2025", - format = "xlsx", type = "indicator", large_file = FALSE, - local_path = "data/datasets/cihi/885/885-number-integrated-youth-services-sites-2024-2025-data-tables-en-additonal.xlsx", - table_name = "cihi885b", ckan_resource_id = ""), + list( + key = "cihidt", name = "CIHI All Indicators", + source = "cihi", survey = "indicators", year = "", + format = "xlsx", type = "indicator", large_file = FALSE, + local_path = "data/datasets/cihi/indicator-library-all-indicator-data-en.xlsx", + table_name = "cihidt", ckan_resource_id = "", download_url = "https://www.cihi.ca/sites/default/files/document/indicator-library-all-indicator-data-en.xlsx" + ), + list( + key = "cihi820a", name = "CIHI 820: Substance Use Harm", + source = "cihi", survey = "indicators", year = "", + format = "xlsx", type = "indicator", large_file = FALSE, + local_path = "data/datasets/cihi/820/820-hospital-stays-for-harm-caused-by-substance-use-data-table-en.xlsx", + table_name = "cihi820a", ckan_resource_id = "", download_url = "https://www.cihi.ca/sites/default/files/document/data-file/820-hospital-stays-for-harm-caused-by-substance-use-data-table-en.xlsx" + ), + list( + key = "cihi820b", name = "CIHI 820: Substance Use Breakdown 2024-2025", + source = "cihi", survey = "indicators", year = "2024-2025", + format = "xlsx", type = "indicator", large_file = FALSE, + local_path = "data/datasets/cihi/820/820-hospital-stays-harm-due-to-substance-use-breakdown-2024-2025-data-tables-en-additional.xlsx", + table_name = "cihi820b", ckan_resource_id = "", download_url = "https://www.cihi.ca/sites/default/files/document/hospital-stays-harm-due-to-substance-use-breakdown-2024-2025-data-tables-en.xlsx" + ), + list( + key = "cihi849", name = "CIHI 849: Alcohol Use Harm", + source = "cihi", survey = "indicators", year = "", + format = "xlsx", type = "indicator", large_file = FALSE, + local_path = "data/datasets/cihi/849/849-hospital-stays-for-harm-caused-by-alcohol-use-data-table-en.xlsx", + table_name = "cihi849", ckan_resource_id = "", download_url = "https://www.cihi.ca/sites/default/files/document/data-file/849-hospital-stays-for-harm-caused-by-alcohol-use-data-table-en.xlsx" + ), + list( + key = "cihi885a", name = "CIHI 885: Youth Services", + source = "cihi", survey = "indicators", year = "", + format = "xlsx", type = "indicator", large_file = FALSE, + local_path = "data/datasets/cihi/885/885-youth-age-12-to-25-who-accessed-integrated-youth-services-for-mental-health-substance-use-and-well-being-support-data-table-en.xlsx", + table_name = "cihi885a", ckan_resource_id = "", download_url = "https://www.cihi.ca/sites/default/files/document/data-file/885-youth-age-12-to-25-who-accessed-integrated-youth-services-for-mental-health-substance-use-and-well-being-support-data-table-en.xlsx" + ), + list( + key = "cihi885b", name = "CIHI 885: Youth Sites 2024-2025", + source = "cihi", survey = "indicators", year = "2024-2025", + format = "xlsx", type = "indicator", large_file = FALSE, + local_path = "data/datasets/cihi/885/885-number-integrated-youth-services-sites-2024-2025-data-tables-en-additonal.xlsx", + table_name = "cihi885b", ckan_resource_id = "", download_url = "https://www.cihi.ca/sites/default/files/document/integrated-youth-services-sites-2024-2025-data-tables-en.xlsx" + ), # VSR Research Data - list(key = "mapq", name = "MAPQ: Modified Attitudes on Psychedelics Questionnaire", - source = "vsr", survey = "mapq", year = "2026", - format = "xlsx", type = "psychometric", large_file = FALSE, - local_path = "data/datasets/vsr/TKARONTOMAPQ.xlsx", - table_name = "mapq", ckan_resource_id = ""), - list(key = "otis", name = "OTIS: Ontario Restrictive Confinement 2023-2025", - source = "vsr", survey = "otis", year = "2023-2025", - format = "rdata", type = "correctional", large_file = FALSE, - local_path = "data/cache/correctional_stats_report_environment1b.RData", - table_name = "otis", ckan_resource_id = ""), - list(key = "otisexp", name = "OTIS Expanded (1.9M placement records)", - source = "vsr", survey = "otis", year = "2023-2025", - format = "rds", type = "correctional", large_file = TRUE, - local_path = "data/cache/dt_expanded.rds", - table_name = "otisexp", ckan_resource_id = ""), - list(key = "otisfin", name = "OTIS Complete Analysis Environment", - source = "vsr", survey = "otis", year = "2023-2025", - format = "rdata", type = "correctional", large_file = TRUE, - local_path = "data/cache/finne_env.RData", - table_name = "otisfin", ckan_resource_id = ""), + list( + key = "mapq", name = "MAPQ: Modified Attitudes on Psychedelics Questionnaire", + source = "vsr", survey = "mapq", year = "2026", + format = "xlsx", type = "psychometric", large_file = FALSE, + local_path = "data/datasets/vsr/TKARONTOMAPQ.xlsx", + table_name = "mapq", ckan_resource_id = "" + ), + list( + key = "otis", name = "OTIS: Ontario Restrictive Confinement 2023-2025", + source = "vsr", survey = "otis", year = "2023-2025", + format = "rdata", type = "correctional", large_file = FALSE, + local_path = "data/cache/correctional_stats_report_environment1b.RData", + table_name = "otis", ckan_resource_id = "" + ), + list( + key = "otisexp", name = "OTIS Expanded (1.9M placement records)", + source = "vsr", survey = "otis", year = "2023-2025", + format = "rds", type = "correctional", large_file = TRUE, + local_path = "data/cache/dt_expanded.rds", + table_name = "otisexp", ckan_resource_id = "" + ), + list( + key = "otisfin", name = "OTIS Complete Analysis Environment", + source = "vsr", survey = "otis", year = "2023-2025", + format = "rdata", type = "correctional", large_file = TRUE, + local_path = "data/cache/finne_env.RData", + table_name = "otisfin", ckan_resource_id = "" + ), # -- OTIS public release per-table CSVs (used by mrm_otis_*) -- # CKAN IDs from data.ontario.ca/dataset/data-on-inmates-in-ontario - list(key = "otisa01", name = "OTIS a01: Restrictive Confinement - Detailed Dataset", - source = "otis", survey = "a01", year = "2023-2025", - format = "csv", type = "correctional", large_file = FALSE, - local_path = "data/datasets/OTIS/a01_restrictive_confinement_detailed_dataset.csv", - table_name = "otisa01", - ckan_resource_id = "5a0c5804-a055-4031-9743-73f556e43bb4"), - list(key = "otisb01", name = "OTIS b01: Segregation - Detailed Dataset", - source = "otis", survey = "b01", year = "2023-2025", - format = "csv", type = "correctional", large_file = FALSE, - local_path = "data/datasets/OTIS/b01_segregation_detailed_dataset.csv", - table_name = "otisb01", - ckan_resource_id = "406e6d90-d568-4553-8ca7-bc9f90e133b9"), - list(key = "otisb09", name = "OTIS b09: Individuals in Segregation - Number of Placements", - source = "otis", survey = "b09", year = "2023-2025", - format = "csv", type = "correctional", large_file = FALSE, - local_path = "data/datasets/OTIS/b09_individuals_in_segregation_number_of_times_in_segregation.csv", - table_name = "otisb09", - ckan_resource_id = "df24e943-d52b-43a8-a10e-a3cc906e26bb"), - list(key = "otisc11", name = "OTIS c11: Individuals in Segregation/RC by Aggregate Length", - source = "otis", survey = "c11", year = "2023-2025", - format = "csv", type = "correctional", large_file = FALSE, - local_path = "data/datasets/OTIS/c11_individuals_in_segregation_and_restrictive_confinement_aggregate_lengths.csv", - table_name = "otisc11", - ckan_resource_id = "9c7b74a5-53ad-4ef0-a7a6-97772cd01c55"), + list( + key = "otisa01", name = "OTIS a01: Restrictive Confinement - Detailed Dataset", + source = "otis", survey = "a01", year = "2023-2025", + format = "csv", type = "correctional", large_file = FALSE, + local_path = "data/datasets/OTIS/a01_restrictive_confinement_detailed_dataset.csv", + table_name = "otisa01", + ckan_resource_id = "5a0c5804-a055-4031-9743-73f556e43bb4" + ), + list( + key = "otisb01", name = "OTIS b01: Segregation - Detailed Dataset", + source = "otis", survey = "b01", year = "2023-2025", + format = "csv", type = "correctional", large_file = FALSE, + local_path = "data/datasets/OTIS/b01_segregation_detailed_dataset.csv", + table_name = "otisb01", + ckan_resource_id = "406e6d90-d568-4553-8ca7-bc9f90e133b9" + ), + list( + key = "otisb09", name = "OTIS b09: Individuals in Segregation - Number of Placements", + source = "otis", survey = "b09", year = "2023-2025", + format = "csv", type = "correctional", large_file = FALSE, + local_path = "data/datasets/OTIS/b09_individuals_in_segregation_number_of_times_in_segregation.csv", + table_name = "otisb09", + ckan_resource_id = "df24e943-d52b-43a8-a10e-a3cc906e26bb" + ), + list( + key = "otisc11", name = "OTIS c11: Individuals in Segregation/RC by Aggregate Length", + source = "otis", survey = "c11", year = "2023-2025", + format = "csv", type = "correctional", large_file = FALSE, + local_path = "data/datasets/OTIS/c11_individuals_in_segregation_and_restrictive_confinement_aggregate_lengths.csv", + table_name = "otisc11", + ckan_resource_id = "9c7b74a5-53ad-4ef0-a7a6-97772cd01c55" + ), # -- SIU public case-level data (used by mrm_siu_*) -- - list(key = "siu", name = "Ontario SIU: case-level investigations (2014-present)", - source = "vsr", survey = "siu", year = "2014-present", - format = "csv", type = "oversight", large_file = FALSE, - local_path = "data/datasets/vsr/SIU.csv", - table_name = "siu", ckan_resource_id = ""), + list( + key = "siu", name = "Ontario SIU: case-level investigations (2014-present)", + source = "vsr", survey = "siu", year = "2014-present", + format = "csv", type = "oversight", large_file = FALSE, + local_path = "data/datasets/vsr/SIU.csv", + table_name = "siu", ckan_resource_id = "" + ), # -- TPS per-category public crime events (used by mrm_tps_*) -- - list(key = "tpsassault", name = "TPS Assault open-data events 2014-present", - source = "tps", survey = "assault", year = "2014-present", - format = "csv", type = "crime", large_file = TRUE, - local_path = "data/datasets/TPS/Assault/CSV", - table_name = "tpsassault", ckan_resource_id = ""), - list(key = "tpshomicides", name = "TPS Homicides open-data events 2014-present", - source = "tps", survey = "homicides", year = "2014-present", - format = "csv", type = "crime", large_file = FALSE, - local_path = "data/datasets/TPS/Homicides/CSV", - table_name = "tpshomicides", ckan_resource_id = ""), - list(key = "tpsshootings", name = "TPS Shootings and Firearm Discharges 2014-present", - source = "tps", survey = "shootings", year = "2014-present", - format = "csv", type = "crime", large_file = TRUE, - local_path = "data/datasets/TPS/ShootingAndFirearmDiscarges/CSV", - table_name = "tpsshootings", ckan_resource_id = "") + list( + key = "tpsassault", name = "TPS Assault open-data events 2014-present", + source = "tps", survey = "assault", year = "2014-present", + format = "csv", type = "crime", large_file = TRUE, + local_path = "data/datasets/TPS/Assault/CSV", + table_name = "tpsassault", ckan_resource_id = "", + arcgis_url = paste0( + "https://services.arcgis.com/S9th0jAJ7bqgIRjw", + "/arcgis/rest/services/Assault_Open_Data", + "/FeatureServer/0" + ) + ), + list( + key = "tpshomicides", name = "TPS Homicides open-data events 2004-present", + source = "tps", survey = "homicides", year = "2004-present", + format = "csv", type = "crime", large_file = FALSE, + local_path = "data/datasets/TPS/Homicides/CSV", + table_name = "tpshomicides", ckan_resource_id = "", + arcgis_url = paste0( + "https://services.arcgis.com/S9th0jAJ7bqgIRjw", + "/arcgis/rest/services", + "/Homicides_Open_Data_ASR_RC_TBL_002", + "/FeatureServer/0" + ) + ), + list( + key = "tpsshootings", name = "TPS Shootings and Firearm Discharges 2004-present", + source = "tps", survey = "shootings", year = "2004-present", + format = "csv", type = "crime", large_file = TRUE, + local_path = "data/datasets/TPS/ShootingAndFirearmDiscarges/CSV", + table_name = "tpsshootings", ckan_resource_id = "", + arcgis_url = paste0( + "https://services.arcgis.com/S9th0jAJ7bqgIRjw", + "/arcgis/rest/services", + "/Shooting_and_Firearm_Discharges_Open_Data", + "/FeatureServer/0" + ) + ) ) - do.call(rbind, lapply(entries, as.data.frame, stringsAsFactors = FALSE)) + # Tolerate entries that omit optional columns (download_url, + # zip_member): fill any missing column with "" before binding. + all_cols <- unique(unlist(lapply(entries, names))) + do.call(rbind, lapply(entries, function(e) { + for (col in setdiff(all_cols, names(e))) e[[col]] <- "" + as.data.frame(e[all_cols], stringsAsFactors = FALSE) + })) } # Backward-compatible mapping: old long keys -> new short keys diff --git a/r-package/morie/R/dataset_profile.R b/r-package/morie/R/dataset_profile.R index 2aad6c719b..dd91e3ca95 100644 --- a/r-package/morie/R/dataset_profile.R +++ b/r-package/morie/R/dataset_profile.R @@ -14,21 +14,29 @@ #' `c("binary", "nominal", "ordinal", "interval", "ratio")`. #' @export #' @examples -#' infer_measurement_level(c(0, 1, 1, 0)) # "binary" -#' infer_measurement_level(factor(c("a", "b", "c"))) # "nominal" -#' infer_measurement_level(ordered(c("low", "med", "high"))) # "ordinal" -#' infer_measurement_level(c(1.2, 3.4, 5.6)) # "ratio" -#' infer_measurement_level(c(-1.5, 0.0, 2.3)) # "interval" -infer_measurement_level <- function(x) { - if (is.logical(x)) return("binary") +#' morie_infer_measurement_level(c(0, 1, 1, 0)) # "binary" +#' morie_infer_measurement_level(factor(c("a", "b", "c"))) # "nominal" +#' morie_infer_measurement_level(ordered(c("low", "med", "high"))) # "ordinal" +#' morie_infer_measurement_level(c(1.2, 3.4, 5.6)) # "ratio" +#' morie_infer_measurement_level(c(-1.5, 0.0, 2.3)) # "interval" +morie_infer_measurement_level <- function(x) { + if (is.logical(x)) { + return("binary") + } uniq <- unique(stats::na.omit(x)) - if (length(uniq) == 2L && all(uniq %in% c(0, 1, "0", "1", TRUE, FALSE))) + if (length(uniq) == 2L && all(uniq %in% c(0, 1, "0", "1", TRUE, FALSE))) { return("binary") - if (is.ordered(x)) return("ordinal") - if (is.factor(x) || is.character(x)) + } + if (is.ordered(x)) { + return("ordinal") + } + if (is.factor(x) || is.character(x)) { return(if (length(uniq) == 2L) "binary" else "nominal") + } if (is.numeric(x)) { - if (length(uniq) == 2L) return("binary") + if (length(uniq) == 2L) { + return("binary") + } return(if (all(x >= 0, na.rm = TRUE)) "ratio" else "interval") } "nominal" @@ -50,30 +58,32 @@ infer_measurement_level <- function(x) { #' } #' @export #' @examples -#' p <- profile_dataset(iris) +#' p <- morie_profile_dataset(iris) #' p$columns$Species #' p$columns$Sepal.Length -profile_dataset <- function(df) { +morie_profile_dataset <- function(df) { if (!is.data.frame(df)) stop("df must be a data.frame.", call. = FALSE) cols <- lapply(names(df), function(nm) { x <- df[[nm]] base <- list( name = nm, dtype = paste(class(x), collapse = "/"), - measurement_level = infer_measurement_level(x), + measurement_level = morie_infer_measurement_level(x), n_missing = sum(is.na(x)), n_unique = length(unique(stats::na.omit(x))) ) if (is.numeric(x)) { - qs <- stats::quantile(x, probs = c(0.25, 0.5, 0.75), na.rm = TRUE, - names = FALSE) + qs <- stats::quantile(x, + probs = c(0.25, 0.5, 0.75), na.rm = TRUE, + names = FALSE + ) base$mean <- mean(x, na.rm = TRUE) - base$sd <- stats::sd(x, na.rm = TRUE) - base$min <- min(x, na.rm = TRUE) - base$max <- max(x, na.rm = TRUE) - base$q25 <- qs[[1L]] - base$q50 <- qs[[2L]] - base$q75 <- qs[[3L]] + base$sd <- stats::sd(x, na.rm = TRUE) + base$min <- min(x, na.rm = TRUE) + base$max <- max(x, na.rm = TRUE) + base$q25 <- qs[[1L]] + base$q50 <- qs[[2L]] + base$q75 <- qs[[3L]] } base }) @@ -84,51 +94,77 @@ profile_dataset <- function(df) { #' Suggest an analysis plan from a dataset profile #' #' Mirrors the Python `morie.suggest_analysis_plan()`. Inspects the output -#' of [profile_dataset()] and returns plain-English recommendations for +#' of [morie_profile_dataset()] and returns plain-English recommendations for #' candidate analyses. #' -#' @param profile A list returned by [profile_dataset()]. +#' @param profile A list returned by [morie_profile_dataset()]. #' #' @return Character vector of suggestion strings, one per recommendation. #' @export #' @examples -#' suggest_analysis_plan(profile_dataset(iris)) -suggest_analysis_plan <- function(profile) { - if (!is.list(profile) || is.null(profile$columns)) - stop("profile must be a list returned by profile_dataset().", call. = FALSE) +#' morie_suggest_analysis_plan(morie_profile_dataset(iris)) +morie_suggest_analysis_plan <- function(profile) { + if (!is.list(profile) || is.null(profile$columns)) { + stop("profile must be a list returned by morie_profile_dataset().", call. = FALSE) + } suggestions <- character(0) cols <- profile$columns levels <- vapply(cols, `[[`, character(1L), "measurement_level") - n_binary <- sum(levels == "binary") + n_binary <- sum(levels == "binary") n_numeric <- sum(levels %in% c("ratio", "interval")) n_nominal <- sum(levels == "nominal") n_ordinal <- sum(levels == "ordinal") - if (n_binary >= 1L && n_numeric >= 1L) - suggestions <- c(suggestions, - "Binary outcome + numeric predictors detected. Logistic regression (glm with family=binomial) is appropriate.") - if (n_binary >= 2L) - suggestions <- c(suggestions, - "Two or more binary variables. Consider chi-square test, Fisher's exact test, or risk-difference / odds-ratio CIs.") - if (n_numeric >= 2L) - suggestions <- c(suggestions, - "Multiple numeric variables. Consider linear regression (lm), Pearson correlation, or principal-components analysis.") - if (n_nominal >= 1L && n_numeric >= 1L) - suggestions <- c(suggestions, - "Nominal grouping + numeric outcome. Consider one-way ANOVA, Kruskal-Wallis, or per-group descriptives.") - if (n_ordinal >= 1L) - suggestions <- c(suggestions, - "Ordinal variable detected. Consider Spearman or Kendall's tau correlation, or proportional-odds models.") + if (n_binary >= 1L && n_numeric >= 1L) { + suggestions <- c( + suggestions, + "Binary outcome + numeric predictors detected. Logistic regression (glm with family=binomial) is appropriate." + ) + } + if (n_binary >= 2L) { + suggestions <- c( + suggestions, + paste0( + "Two or more binary variables. Consider chi-square test, ", + "Fisher.s exact test, or risk-difference / odds-ratio CIs." + ) + ) + } + if (n_numeric >= 2L) { + suggestions <- c( + suggestions, + paste0( + "Multiple numeric variables. Consider linear regression (lm), ", + "Pearson correlation, or principal-components analysis." + ) + ) + } + if (n_nominal >= 1L && n_numeric >= 1L) { + suggestions <- c( + suggestions, + "Nominal grouping + numeric outcome. Consider one-way ANOVA, Kruskal-Wallis, or per-group descriptives." + ) + } + if (n_ordinal >= 1L) { + suggestions <- c( + suggestions, + "Ordinal variable detected. Consider Spearman or Kendall's tau correlation, or proportional-odds models." + ) + } any_missing <- any(vapply(cols, function(c) c$n_missing > 0L, logical(1L))) - if (any_missing) - suggestions <- c(suggestions, - "Missing values present. Consider multiple imputation (e.g. mice) or complete-case sensitivity analysis.") + if (any_missing) { + suggestions <- c( + suggestions, + "Missing values present. Consider multiple imputation (e.g. mice) or complete-case sensitivity analysis." + ) + } - if (length(suggestions) == 0L) + if (length(suggestions) == 0L) { suggestions <- "No standard analysis pattern triggered. Inspect the profile manually." + } suggestions } diff --git a/r-package/morie/R/dbscl.R b/r-package/morie/R/dbscl.R index ae9aaf19b0..2c83ef5693 100644 --- a/r-package/morie/R/dbscl.R +++ b/r-package/morie/R/dbscl.R @@ -11,20 +11,17 @@ #' @return Named list: estimate, labels, n_clusters, n_noise, #' core_sample_indices, eps, min_samples, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_dbscan_clustering(x = rnorm(50)) #' @export -dbscan_clustering <- function(x, eps = 0.5, min_samples = 5L, - metric = "euclidean") { +morie_dbscan_clustering <- function(x, eps = 0.5, min_samples = 5L, + metric = "euclidean") { if (!requireNamespace("dbscan", quietly = TRUE)) { - stop("Function 'dbscan_clustering' requires package 'dbscan'. Install with install.packages('dbscan').") + stop("Function 'morie_dbscan_clustering' requires package 'dbscan'. Install with install.packages('dbscan').") } if (is.null(dim(x))) x <- matrix(x, ncol = 1) x <- as.matrix(x) fit <- dbscan::dbscan(x, eps = eps, minPts = min_samples) - labels <- fit$cluster # 0 = noise in dbscan; sklearn uses -1 + labels <- fit$cluster # 0 = noise in dbscan; sklearn uses -1 labels_sk <- ifelse(labels == 0L, -1L, labels - 1L) n_clusters <- length(unique(labels_sk[labels_sk >= 0L])) n_noise <- sum(labels_sk == -1L) @@ -36,7 +33,7 @@ dbscan_clustering <- function(x, eps = 0.5, min_samples = 5L, labels = as.integer(labels_sk), n_clusters = as.integer(n_clusters), n_noise = as.integer(n_noise), - core_sample_indices = as.integer(core_idx - 1L), # 0-indexed + core_sample_indices = as.integer(core_idx - 1L), # 0-indexed eps = as.numeric(eps), min_samples = as.integer(min_samples), n = nrow(x), diff --git a/r-package/morie/R/dccmd.R b/r-package/morie/R/dccmd.R index 5d34d0b813..19aa936e8d 100644 --- a/r-package/morie/R/dccmd.R +++ b/r-package/morie/R/dccmd.R @@ -1,5 +1,36 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: DCC(1,1) two-step Gaussian negative log-likelihood for the +# base-R fallback path. Extracted from the morie_dcc_multivariate_garch() +# optimiser closure so the parameter-domain and non-positive-determinant +# guards are directly unit-testable. `Q_bar` is the unconditional +# correlation, `n` the sample size, `Z` the standardised residuals. +.dccmd_negll <- function(p, Q_bar, n, Z) { + a <- p[1] + b <- p[2] + if (a < 0 || b < 0 || a + b >= 0.9999) { + return(1e10) + } + Q <- Q_bar + ll <- 0 + for (t in seq_len(n)) { + d <- sqrt(pmax(diag(Q), 1e-12)) + R <- Q / outer(d, d) + ld <- determinant(R, logarithm = TRUE) + # determinant() reports sign = +1 for a singular matrix (det == 0), + # so a sign test alone misses it; modulus == -Inf catches singularity + # and prevents the solve(R) below from erroring on a non-invertible R. + if (ld$sign <= 0 || !is.finite(ld$modulus)) { + return(1e10) + } + Rinv <- solve(R) + zt <- Z[t, ] + ll <- ll + 0.5 * (ld$modulus + sum(zt * (Rinv %*% zt)) - sum(zt^2)) + Q <- (1 - a - b) * Q_bar + a * tcrossprod(zt) + b * Q + } + as.numeric(ll) +} + #' DCC multivariate GARCH (Engle 2002) #' #' Two-step DCC(1,1) on a panel of return series. @@ -8,68 +39,69 @@ #' @return Named list with \code{a, b, unconditional_correlation, #' conditional_correlation, conditional_variance, loglik, n, k, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_dcc_multivariate_garch(x = matrix(rnorm(150), 50, 3)) #' @export -dcc_multivariate_garch <- function(x) { - X <- as.matrix(x); if (nrow(X) < ncol(X)) X <- t(X) - n <- nrow(X); k <- ncol(X) +morie_dcc_multivariate_garch <- function(x) { + X <- as.matrix(x) + if (nrow(X) < ncol(X)) X <- t(X) + n <- nrow(X) + k <- ncol(X) if (n < 30 || k < 2) stop("Need n>=30, k>=2.") if (requireNamespace("rmgarch", quietly = TRUE) && - requireNamespace("rugarch", quietly = TRUE)) { - uspec <- rugarch::multispec(replicate(k, rugarch::ugarchspec( - variance.model = list(model = "sGARCH", garchOrder = c(1, 1)), - mean.model = list(armaOrder = c(0, 0), include.mean = FALSE)), - simplify = FALSE)) - dccspec <- rmgarch::dccspec(uspec = uspec, dccOrder = c(1, 1), - distribution = "mvnorm") - fit <- rmgarch::dccfit(dccspec, data = X) - # `coef`, `sigma`, `likelihood` are S4 generics (stats / rugarch); - # rmgarch provides methods but does not export the generics themselves, - # so namespace-qualifying them as `rmgarch::*` trips an R CMD check - # WARNING. Use the canonical generic-defining packages directly. - p <- stats::coef(fit) - sig_mat <- as.matrix(stats::sigma(fit)) - return(list(a = unname(p["[Joint]dcca1"]), - b = unname(p["[Joint]dccb1"]), - unconditional_correlation = cor(X), - conditional_correlation = rmgarch::rcor(fit), - conditional_variance = sig_mat^2, - loglik = as.numeric(rugarch::likelihood(fit)), - n = n, k = k, - method = "DCC(1,1) via rmgarch")) + requireNamespace("rugarch", quietly = TRUE)) { + # The rmgarch DCC path relies on S4 `coef`/`sigma` methods whose + # dispatch and slot layout vary across rmgarch versions. Wrap it so + # that any API mismatch degrades gracefully to the base-R DCC below + # rather than hard-failing. + res <- tryCatch( + { + uspec <- rugarch::multispec(replicate(k, rugarch::ugarchspec( + variance.model = list(model = "sGARCH", garchOrder = c(1, 1)), + mean.model = list(armaOrder = c(0, 0), include.mean = FALSE) + ), + simplify = FALSE + )) + dccspec <- rmgarch::dccspec( + uspec = uspec, dccOrder = c(1, 1), + distribution = "mvnorm" + ) + fit <- rmgarch::dccfit(dccspec, data = X) + p <- stats::coef(fit) + sig_mat <- as.matrix(stats::sigma(fit)) + list( + a = unname(p["[Joint]dcca1"]), + b = unname(p["[Joint]dccb1"]), + unconditional_correlation = cor(X), + conditional_correlation = rmgarch::rcor(fit), + conditional_variance = sig_mat^2, + loglik = as.numeric(rugarch::likelihood(fit)), + n = n, k = k, + method = "DCC(1,1) via rmgarch" + ) + }, + error = function(e) NULL + ) + if (!is.null(res)) { + return(res) + } } # Fallback: two-step EWMA-marginal + closed-form DCC update. - H <- matrix(NA_real_, n, k); Z <- matrix(NA_real_, n, k) + H <- matrix(NA_real_, n, k) + Z <- matrix(NA_real_, n, k) for (j in seq_len(k)) { rj <- X[, j] - mean(X[, j]) - g <- garch_fit(rj) + g <- morie_garch_fit(rj) H[, j] <- g$conditional_variance Z[, j] <- rj / sqrt(H[, j] + 1e-12) } Q_bar <- crossprod(Z) / n - neg_ll <- function(p) { - a <- p[1]; b <- p[2] - if (a < 0 || b < 0 || a + b >= 0.9999) return(1e10) - Q <- Q_bar; ll <- 0 - for (t in seq_len(n)) { - d <- sqrt(pmax(diag(Q), 1e-12)) - R <- Q / outer(d, d) - ld <- determinant(R, logarithm = TRUE) - if (ld$sign <= 0) return(1e10) - Rinv <- solve(R) - zt <- Z[t, ] - ll <- ll + 0.5 * (ld$modulus + sum(zt * (Rinv %*% zt)) - sum(zt^2)) - Q <- (1 - a - b) * Q_bar + a * tcrossprod(zt) + b * Q - } - as.numeric(ll) - } + neg_ll <- function(p) .dccmd_negll(p, Q_bar, n, Z) opt <- nlminb(c(0.02, 0.95), neg_ll, - lower = c(1e-6, 1e-6), - upper = c(0.5, 0.999)) - a <- opt$par[1]; b <- opt$par[2] + lower = c(1e-6, 1e-6), + upper = c(0.5, 0.999) + ) + a <- opt$par[1] + b <- opt$par[2] Q <- Q_bar R_path <- array(NA_real_, c(n, k, k)) for (t in seq_len(n)) { @@ -77,11 +109,13 @@ dcc_multivariate_garch <- function(x) { R_path[t, , ] <- Q / outer(d, d) Q <- (1 - a - b) * Q_bar + a * tcrossprod(Z[t, ]) + b * Q } - list(a = a, b = b, - unconditional_correlation = Q_bar, - conditional_correlation = R_path, - conditional_variance = H, - loglik = -opt$objective, - n = n, k = k, - method = "DCC(1,1) two-step Gaussian MLE (base R)") + list( + a = a, b = b, + unconditional_correlation = Q_bar, + conditional_correlation = R_path, + conditional_variance = H, + loglik = -opt$objective, + n = n, k = k, + method = "DCC(1,1) two-step Gaussian MLE (base R)" + ) } diff --git a/r-package/morie/R/diffu.R b/r-package/morie/R/diffu.R index 67d1b1e2ac..f24b2b265a 100644 --- a/r-package/morie/R/diffu.R +++ b/r-package/morie/R/diffu.R @@ -15,18 +15,16 @@ #' r_stability, n_steps, alpha, method)}. #' @references Crank (1975), Mathematics of Diffusion. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_diffu_heat_diffusion(T0 = rep(0, 10)) #' @export -diffu_heat_diffusion <- function(T0, alpha = 0.01, dx = 0.1, dt = 0.01, +morie_diffu_heat_diffusion <- function(T0, alpha = 0.01, dx = 0.1, dt = 0.01, n_steps = 100L) { T0 <- as.numeric(T0) if (length(T0) < 3L) stop("T0 must have at least 3 points.") r <- alpha * dt / (dx^2) - if (r > 0.5) + if (r > 0.5) { stop(sprintf("CFL violated: r=%.4f > 0.5. Reduce dt or increase dx.", r)) + } n_x <- length(T0) Tt <- T0 history <- matrix(0, n_steps + 1L, n_x) @@ -38,10 +36,12 @@ diffu_heat_diffusion <- function(T0, alpha = 0.01, dx = 0.1, dt = 0.01, Tt <- Tn history[k + 1L, ] <- Tt } - list(value = mean(Tt), T_final = Tt, T_initial = T0, - history = history, r_stability = r, - n_steps = as.integer(n_steps), alpha = alpha, - method = "1D Heat Diffusion (forward Euler)") + list( + value = mean(Tt), T_final = Tt, T_initial = T0, + history = history, r_stability = r, + n_steps = as.integer(n_steps), alpha = alpha, + method = "1D Heat Diffusion (forward Euler)" + ) } #' DDPM forward (noising) process @@ -62,18 +62,17 @@ diffu_heat_diffusion <- function(T0, alpha = 0.01, dx = 0.1, dt = 0.01, #' @return Named list \code{(x_t, estimate, noise, alpha_bar, beta, method)}. #' @references Ho, Jain & Abbeel (2020), NeurIPS. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -diffu_diffusion_forward <- function(x0, t, betas = NULL, num_steps = 1000L, +morie_diffu_diffusion_forward <- function(x0, t, betas = NULL, num_steps = 1000L, noise = NULL, seed = 0L) { x0 <- as.numeric(x0) if (is.null(betas)) betas <- seq(1e-4, 0.02, length.out = num_steps) betas <- as.numeric(betas) - if (t < 1L || t > length(betas)) + if (t < 1L || t > length(betas)) { stop(sprintf("t must be in [1, %d], got %d", length(betas), t)) + } alphas <- 1 - betas alpha_bar <- prod(alphas[1:t]) if (is.null(noise)) { @@ -82,12 +81,14 @@ diffu_diffusion_forward <- function(x0, t, betas = NULL, num_steps = 1000L, } noise <- as.numeric(noise) x_t <- sqrt(alpha_bar) * x0 + sqrt(1 - alpha_bar) * noise - list(x_t = x_t, estimate = x_t, noise = noise, - alpha_bar = alpha_bar, beta = betas[t], - method = "DDPM forward diffusion") + list( + x_t = x_t, estimate = x_t, noise = noise, + alpha_bar = alpha_bar, beta = betas[t], + method = "DDPM forward diffusion" + ) } -#' @rdname diffu_heat_diffusion +#' @rdname morie_diffu_heat_diffusion #' @keywords internal #' @export -diffusion_forward <- diffu_heat_diffusion +morie_diffusion_forward <- morie_diffu_heat_diffusion diff --git a/r-package/morie/R/dimrd.R b/r-package/morie/R/dimrd.R index b1f37da79f..b8db4d2527 100644 --- a/r-package/morie/R/dimrd.R +++ b/r-package/morie/R/dimrd.R @@ -11,35 +11,43 @@ #' @return Named list with `n_dims`, `eigenvalues`, `threshold`, #' `scree_gap`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' dimrd(x = rnorm(50)) #' @export dimrd <- function(x, threshold = 1) { M <- if (is.matrix(x)) x else matrix(as.numeric(x), ncol = 1L) - n <- nrow(M); m <- ncol(M) + n <- nrow(M) + m <- ncol(M) is_sym <- (n == m) && isTRUE(all.equal(M, t(M))) - S <- if (is_sym) (M + t(M)) / 2 - else { - if (m < 2L) - return(list(n_dims = 0L, eigenvalues = numeric(0), - threshold = threshold, scree_gap = NA_integer_, - method = "dimensionality_test")) - S0 <- suppressWarnings(stats::cor(M)) - S0[is.na(S0)] <- 0; diag(S0) <- 1; S0 - } - ev <- sort(eigen((S + t(S)) / 2, symmetric = TRUE, - only.values = TRUE)$values, decreasing = TRUE) + S <- if (is_sym) { + (M + t(M)) / 2 + } else { + if (m < 2L) { + return(list( + n_dims = 0L, eigenvalues = numeric(0), + threshold = threshold, scree_gap = NA_integer_, + method = "morie_dimensionality_test" + )) + } + S0 <- suppressWarnings(stats::cor(M)) + S0[is.na(S0)] <- 0 + diag(S0) <- 1 + S0 + } + ev <- sort(eigen((S + t(S)) / 2, + symmetric = TRUE, + only.values = TRUE + )$values, decreasing = TRUE) n_dims <- sum(ev > threshold) gaps <- -diff(ev) scree <- if (length(gaps)) which.max(gaps) else 0L - list(n_dims = as.integer(n_dims), eigenvalues = ev, - threshold = threshold, scree_gap = as.integer(scree), - method = "dimensionality_test") + list( + n_dims = as.integer(n_dims), eigenvalues = ev, + threshold = threshold, scree_gap = as.integer(scree), + method = "morie_dimensionality_test" + ) } #' @keywords internal #' @rdname dimrd #' @export -dimensionality_test <- dimrd +morie_dimensionality_test <- dimrd diff --git a/r-package/morie/R/dlgen.R b/r-package/morie/R/dlgen.R index f70c49deb5..22160024af 100644 --- a/r-package/morie/R/dlgen.R +++ b/r-package/morie/R/dlgen.R @@ -17,23 +17,27 @@ #' @return list(estimate, y_hat, beta, W1, b1, w2, b2, se, n, method). #' @references Montesinos Lopez Ch 12. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_deep_learning_genomic( +#' x = rnorm(50), y = rnorm(50), +#' markers = matrix(sample(0:2, 200, TRUE), 50, 4) +#' ) #' @export -deep_learning_genomic <- function(x, y, markers, hidden = 16, - n_epochs = 200, lr = 1e-2, - l2 = 1e-3, seed = 0, - deterministic_seed = NULL) { +morie_deep_learning_genomic <- function(x, y, markers, hidden = 16, + n_epochs = 200, lr = 1e-2, + l2 = 1e-3, seed = 0, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("dlgen", deterministic_seed) } else { set.seed(seed) } - y <- as.numeric(y); n <- length(y) - M <- as.matrix(markers); m <- ncol(M) - M_mu <- colMeans(M); M_sd <- apply(M, 2, stats::sd); M_sd[M_sd == 0] <- 1 + y <- as.numeric(y) + n <- length(y) + M <- as.matrix(markers) + m <- ncol(M) + M_mu <- colMeans(M) + M_sd <- apply(M, 2, stats::sd) + M_sd[M_sd == 0] <- 1 Ms <- sweep(sweep(M, 2, M_mu), 2, M_sd, "/") W1 <- matrix(stats::rnorm(m * hidden, 0, 1 / sqrt(m)), m, hidden) b1 <- rep(0, hidden) @@ -52,19 +56,24 @@ deep_learning_genomic <- function(x, y, markers, hidden = 16, dz1 <- dh * (1 - h^2) dW1 <- crossprod(Ms, dz1) + l2 * W1 db1 <- colSums(dz1) - W1 <- W1 - lr * dW1; b1 <- b1 - lr * db1 - w2 <- w2 - lr * dw2; b2 <- b2 - lr * db2 + W1 <- W1 - lr * dW1 + b1 <- b1 - lr * db1 + w2 <- w2 - lr * dw2 + b2 <- b2 - lr * db2 losses[ep] <- mean(resid^2) } - z1 <- sweep(Ms %*% W1, 2, b1, "+"); h <- tanh(z1) + z1 <- sweep(Ms %*% W1, 2, b1, "+") + h <- tanh(z1) y_hat <- as.numeric(h %*% w2) + b2 resid <- y - y_hat - list(estimate = mean(y_hat), y_hat = y_hat, beta = numeric(0), - W1 = W1, b1 = b1, w2 = w2, b2 = b2, - loss_curve = losses, se = sqrt(mean(resid^2)), - n = n, method = "MLP-1H base-R") + list( + estimate = mean(y_hat), y_hat = y_hat, beta = numeric(0), + W1 = W1, b1 = b1, w2 = w2, b2 = b2, + loss_curve = losses, se = sqrt(mean(resid^2)), + n = n, method = "MLP-1H base-R" + ) } # CANONICAL TEST # set.seed(6); M <- matrix(rnorm(100), 20, 5) -# y <- M[,1] + 0.3*rnorm(20); deep_learning_genomic(rep(0,20), y, M, seed=6) +# y <- M[,1] + 0.3*rnorm(20); morie_deep_learning_genomic(rep(0,20), y, M, seed=6) diff --git a/r-package/morie/R/drpfw.R b/r-package/morie/R/drpfw.R index b364676b92..00f8973db7 100644 --- a/r-package/morie/R/drpfw.R +++ b/r-package/morie/R/drpfw.R @@ -17,19 +17,18 @@ #' @return Named list \code{(y, estimate, mask, p, kept_fraction, method)}. #' @references Srivastava et al. (2014), JMLR 15:1929-1958. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_drpfw_dropout_forward(x = rnorm(50)) #' @export -drpfw_dropout_forward <- function(x, p = 0.5, seed = 0L, training = TRUE, +morie_drpfw_dropout_forward <- function(x, p = 0.5, seed = 0L, training = TRUE, deterministic_seed = NULL) { if (p < 0 || p >= 1) stop(sprintf("p must be in [0, 1), got %g", p)) x <- as.array(x) if (!training || p == 0) { - return(list(y = x, estimate = x, mask = array(1, dim(x)), p = p, - kept_fraction = 1.0, - method = "Dropout (pass-through)")) + return(list( + y = x, estimate = x, mask = array(1, dim(x)), p = p, + kept_fraction = 1.0, + method = "Dropout (pass-through)" + )) } if (!is.null(deterministic_seed)) { morie_det_rng("drpfw", deterministic_seed) @@ -38,12 +37,14 @@ drpfw_dropout_forward <- function(x, p = 0.5, seed = 0L, training = TRUE, } mask <- array((stats::runif(length(x)) >= p) * 1.0, dim = dim(x)) y <- x * mask / (1 - p) - list(y = y, estimate = y, mask = mask, p = p, - kept_fraction = mean(mask), - method = "Dropout forward (inverted)") + list( + y = y, estimate = y, mask = mask, p = p, + kept_fraction = mean(mask), + method = "Dropout forward (inverted)" + ) } -#' @rdname drpfw_dropout_forward +#' @rdname morie_drpfw_dropout_forward #' @keywords internal #' @export -dropout_forward <- drpfw_dropout_forward +morie_dropout_forward <- morie_drpfw_dropout_forward diff --git a/r-package/morie/R/dtrsp.R b/r-package/morie/R/dtrsp.R index 032c8159e2..eae357c516 100644 --- a/r-package/morie/R/dtrsp.R +++ b/r-package/morie/R/dtrsp.R @@ -15,32 +15,35 @@ #' root_impurity, n_leaves, feature_importances, criterion, n, method. #' @importFrom stats predict #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_decision_tree_split(x = rnorm(50), y = rnorm(50)) #' @export -decision_tree_split <- function(x, y, criterion = "gini", max_depth = 30L, - seed = 0L) { +morie_decision_tree_split <- function(x, y, criterion = "gini", max_depth = 30L, + seed = 0L) { if (!requireNamespace("rpart", quietly = TRUE)) { - stop("Function 'decision_tree_split' requires package 'rpart'. Install with install.packages('rpart').") + stop("Function 'morie_decision_tree_split' requires package 'rpart'. Install with install.packages('rpart').") } if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); yf <- as.factor(y) + x <- as.matrix(x) + yf <- as.factor(y) colnames(x) <- colnames(x) %||% paste0("x", seq_len(ncol(x)) - 1L) parms_split <- if (criterion == "entropy") "information" else "gini" set.seed(seed) - df <- as.data.frame(x); df$.y <- yf - ctrl <- rpart::rpart.control(maxdepth = max_depth, cp = 0, - minsplit = 2L, minbucket = 1L, - xval = 0L) - fit <- rpart::rpart(.y ~ ., data = df, method = "class", - parms = list(split = parms_split), control = ctrl) + df <- as.data.frame(x) + df$.y <- yf + ctrl <- rpart::rpart.control( + maxdepth = max_depth, cp = 0, + minsplit = 2L, minbucket = 1L, + xval = 0L + ) + fit <- rpart::rpart(.y ~ ., + data = df, method = "class", + parms = list(split = parms_split), control = ctrl + ) fr <- fit$frame root <- fr[1, , drop = FALSE] # rpart variable names are factor codes via fit$splits / fit$frame$var root_feat_name <- as.character(root$var) - root_feat <- match(root_feat_name, colnames(x)) - 1L # 0-indexed parity + root_feat <- match(root_feat_name, colnames(x)) - 1L # 0-indexed parity splits <- fit$splits if (!is.null(splits) && nrow(splits) > 0) { root_thr <- as.numeric(splits[1, "index"]) @@ -50,8 +53,11 @@ decision_tree_split <- function(x, y, criterion = "gini", max_depth = 30L, # rpart impurity = root yval2 deviance / wt; surrogate: Gini at root tab <- table(yf) pk <- tab / sum(tab) - root_imp <- if (criterion == "entropy") -sum(ifelse(pk > 0, pk * log(pk), 0)) - else 1 - sum(pk^2) + root_imp <- if (criterion == "entropy") { + -sum(ifelse(pk > 0, pk * log(pk), 0)) + } else { + 1 - sum(pk^2) + } preds <- predict(fit, df, type = "class") acc <- mean(preds == yf) fi <- fit$variable.importance diff --git a/r-package/morie/R/dwnmn.R b/r-package/morie/R/dwnmn.R index 4ccbd4bcd0..700a4c51c3 100644 --- a/r-package/morie/R/dwnmn.R +++ b/r-package/morie/R/dwnmn.R @@ -6,50 +6,65 @@ #' series (or a panel of legislators). #' #' @param x Numeric vector (per-period ideal points) or matrix -#' (n_legislators by T). +#' (n_legislators by n_t). #' @param sigma_w Random-walk innovation SD. #' @return Named list with `smoothed`, `raw`, `sigma_w`, `n_periods`, #' `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' dwnmn(x = rnorm(50)) #' @export dwnmn <- function(x, sigma_w = 0.1) { if (is.matrix(x)) { - n <- nrow(x); T <- ncol(x); out <- matrix(0, n, T) - for (i in seq_len(n)) + n <- nrow(x) + n_t <- ncol(x) + out <- matrix(0, n, n_t) + for (i in seq_len(n)) { out[i, ] <- dwnmn(x[i, ], sigma_w = sigma_w)$smoothed - return(list(smoothed = out, raw = x, sigma_w = sigma_w, - n_periods = T, n_units = n, - method = "dynamic_wnominate")) + } + return(list( + smoothed = out, raw = x, sigma_w = sigma_w, + n_periods = n_t, n_units = n, + method = "morie_dynamic_wnominate" + )) + } + raw <- as.numeric(x) + n_t <- length(raw) + if (n_t == 0L) { + return(list( + smoothed = numeric(0), raw = numeric(0), + sigma_w = sigma_w, n_periods = 0L, + method = "morie_dynamic_wnominate" + )) } - raw <- as.numeric(x); T <- length(raw) - if (T == 0L) - return(list(smoothed = numeric(0), raw = numeric(0), - sigma_w = sigma_w, n_periods = 0L, - method = "dynamic_wnominate")) s2_obs <- stats::var(raw) + 1e-6 - m <- numeric(T); P <- numeric(T); m[1] <- raw[1]; P[1] <- s2_obs - for (t in 2:T) { - mp <- m[t - 1]; Pp <- P[t - 1] + sigma_w^2 + m <- numeric(n_t) + P <- numeric(n_t) + m[1] <- raw[1] + P[1] <- s2_obs + for (t in 2:n_t) { + mp <- m[t - 1] + Pp <- P[t - 1] + sigma_w^2 K <- Pp / (Pp + s2_obs) m[t] <- mp + K * (raw[t] - mp) P[t] <- (1 - K) * Pp } - ms <- m; Ps <- P - if (T >= 2L) for (t in (T - 1L):1L) { - Pp <- P[t] + sigma_w^2 - J <- P[t] / Pp - ms[t] <- m[t] + J * (ms[t + 1] - m[t]) - Ps[t] <- P[t] + J^2 * (Ps[t + 1] - Pp) + ms <- m + Ps <- P + if (n_t >= 2L) { + for (t in (n_t - 1L):1L) { + Pp <- P[t] + sigma_w^2 + J <- P[t] / Pp + ms[t] <- m[t] + J * (ms[t + 1] - m[t]) + Ps[t] <- P[t] + J^2 * (Ps[t + 1] - Pp) + } } - list(smoothed = ms, raw = raw, P_smoothed = Ps, sigma_w = sigma_w, - n_periods = T, method = "dynamic_wnominate") + list( + smoothed = ms, raw = raw, P_smoothed = Ps, sigma_w = sigma_w, + n_periods = n_t, method = "morie_dynamic_wnominate" + ) } #' @keywords internal #' @rdname dwnmn #' @export -dynamic_wnominate <- dwnmn +morie_dynamic_wnominate <- dwnmn diff --git a/r-package/morie/R/ebac.R b/r-package/morie/R/ebac.R index f46b047173..3ae95a8186 100644 --- a/r-package/morie/R/ebac.R +++ b/r-package/morie/R/ebac.R @@ -16,16 +16,18 @@ #' @return Non-negative numeric scalar: estimated BAC. #' @export #' @examples -#' calculate_ebac(drinks = 4, weight_lbs = 180, hours = 2, gender_constant = 0.73) -calculate_ebac <- function(drinks, weight_lbs, hours, gender_constant) { - if (weight_lbs <= 0) return(0.0) +#' morie_calculate_ebac(drinks = 4, weight_lbs = 180, hours = 2, gender_constant = 0.73) +morie_calculate_ebac <- function(drinks, weight_lbs, hours, gender_constant) { + if (weight_lbs <= 0) { + return(0.0) + } ebac <- (drinks * 5.14) / (weight_lbs * gender_constant) - (0.015 * hours) max(0.0, ebac) } #' Test whether an eBAC exceeds a legal driving limit #' -#' @param ebac Numeric eBAC value (e.g. from [calculate_ebac()]). +#' @param ebac Numeric eBAC value (e.g. from [morie_calculate_ebac()]). #' @param limit Legal threshold (default 0.08, the per-se limit in most #' Canadian and US jurisdictions). #' @@ -33,8 +35,8 @@ calculate_ebac <- function(drinks, weight_lbs, hours, gender_constant) { #' logical, to match the Python sibling and ease binary-outcome modelling.) #' @export #' @examples -#' is_over_legal_limit(0.09) -#' is_over_legal_limit(0.05, limit = 0.05) -is_over_legal_limit <- function(ebac, limit = 0.08) { +#' morie_is_over_legal_limit(0.09) +#' morie_is_over_legal_limit(0.05, limit = 0.05) +morie_is_over_legal_limit <- function(ebac, limit = 0.08) { if (ebac >= limit) 1L else 0L } diff --git a/r-package/morie/R/egrch.R b/r-package/morie/R/egrch.R index 96b712977e..92ee124562 100644 --- a/r-package/morie/R/egrch.R +++ b/r-package/morie/R/egrch.R @@ -1,18 +1,38 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: EGARCH(1,1) Gaussian negative log-likelihood. Extracted from +# the morie_egarch_model() base-R optimiser closure so the |beta| >= 1 +# stationarity guard is directly unit-testable. `r` is the centred +# series, `n` its length, `EZ` = E|Z| for a standard normal. +.egrch_negll <- function(p, r, n, EZ) { + omega <- p[1] + alpha <- p[2] + gamma <- p[3] + beta <- p[4] + if (abs(beta) >= 1) { + return(1e10) + } + log_s2 <- numeric(n) + log_s2[1] <- log(var(r) + 1e-12) + for (t in 2:n) { + z <- r[t - 1] / sqrt(exp(log_s2[t - 1]) + 1e-12) + log_s2[t] <- omega + beta * log_s2[t - 1] + alpha * (abs(z) - EZ) + gamma * z + } + s2 <- exp(log_s2) + 0.5 * sum(log(2 * pi * s2) + r^2 / s2) +} + #' EGARCH(1,1) asymmetric volatility model #' -#' @inheritParams garch_fit +#' @inheritParams morie_garch_fit #' @return Named list with \code{omega, alpha, gamma, beta, loglik, #' conditional_variance, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_egarch_model(x = rnorm(50)) #' @export -egarch_model <- function(x) { - r <- as.numeric(x) - mean(as.numeric(x)); n <- length(r) +morie_egarch_model <- function(x) { + r <- as.numeric(x) - mean(as.numeric(x)) + n <- length(r) if (n < 20) stop("Need >=20 obs.") if (requireNamespace("rugarch", quietly = TRUE)) { spec <- rugarch::ugarchspec( @@ -21,39 +41,35 @@ egarch_model <- function(x) { ) fit <- rugarch::ugarchfit(spec, r, solver = "hybrid") p <- rugarch::coef(fit) - return(list(omega = unname(p["omega"]), - alpha = unname(p["alpha1"]), - gamma = unname(p["gamma1"]), - beta = unname(p["beta1"]), - loglik = as.numeric(rugarch::likelihood(fit)), - conditional_variance = as.numeric(rugarch::sigma(fit))^2, - n = n, - method = "EGARCH(1,1) via rugarch")) + return(list( + omega = unname(p["omega"]), + alpha = unname(p["alpha1"]), + gamma = unname(p["gamma1"]), + beta = unname(p["beta1"]), + loglik = as.numeric(rugarch::likelihood(fit)), + conditional_variance = as.numeric(rugarch::sigma(fit))^2, + n = n, + method = "EGARCH(1,1) via rugarch" + )) } EZ <- sqrt(2 / pi) - neg_ll <- function(p) { - omega <- p[1]; alpha <- p[2]; gamma <- p[3]; beta <- p[4] - if (abs(beta) >= 1) return(1e10) - log_s2 <- numeric(n); log_s2[1] <- log(var(r) + 1e-12) - for (t in 2:n) { - z <- r[t - 1] / sqrt(exp(log_s2[t - 1]) + 1e-12) - log_s2[t] <- omega + beta * log_s2[t - 1] + alpha * (abs(z) - EZ) + gamma * z - } - s2 <- exp(log_s2) - 0.5 * sum(log(2 * pi * s2) + r^2 / s2) - } + neg_ll <- function(p) .egrch_negll(p, r, n, EZ) opt <- nlminb(c(0, 0.1, 0, 0.9), neg_ll, - lower = c(-5, -1, -1, -0.999), - upper = c(5, 1, 1, 0.999)) - log_s2 <- numeric(n); log_s2[1] <- log(var(r) + 1e-12) + lower = c(-5, -1, -1, -0.999), + upper = c(5, 1, 1, 0.999) + ) + log_s2 <- numeric(n) + log_s2[1] <- log(var(r) + 1e-12) for (t in 2:n) { z <- r[t - 1] / sqrt(exp(log_s2[t - 1]) + 1e-12) log_s2[t] <- opt$par[1] + opt$par[4] * log_s2[t - 1] + - opt$par[2] * (abs(z) - EZ) + opt$par[3] * z + opt$par[2] * (abs(z) - EZ) + opt$par[3] * z } - list(omega = opt$par[1], alpha = opt$par[2], - gamma = opt$par[3], beta = opt$par[4], - loglik = -opt$objective, - conditional_variance = exp(log_s2), n = n, - method = "EGARCH(1,1) Gaussian MLE (base R)") + list( + omega = opt$par[1], alpha = opt$par[2], + gamma = opt$par[3], beta = opt$par[4], + loglik = -opt$objective, + conditional_variance = exp(log_s2), n = n, + method = "EGARCH(1,1) Gaussian MLE (base R)" + ) } diff --git a/r-package/morie/R/entheo_analysis.R b/r-package/morie/R/entheo_analysis.R index ada90da5f8..1c18a2d4b7 100644 --- a/r-package/morie/R/entheo_analysis.R +++ b/r-package/morie/R/entheo_analysis.R @@ -22,26 +22,32 @@ beautiful_loop_metric <- function(eeg, fmri = NULL) { pair <- .entheo_extract_pair(eeg, fmri) warnings_vec <- character(0) if (is.null(pair$e_dmt) || is.null(pair$f_dmt)) { - return(list(score = NA_real_, - score_dmt = NA_real_, score_pcb = NA_real_, - contrast = NA_real_, - per_frame_dmt = NULL, per_frame_pcb = NULL, - warnings = "EEG or fMRI missing for primary condition")) + return(list( + score = NA_real_, + score_dmt = NA_real_, score_pcb = NA_real_, + contrast = NA_real_, + per_frame_dmt = NULL, per_frame_pcb = NULL, + warnings = "EEG or fMRI missing for primary condition" + )) } pf_dmt <- .entheo_binding_per_frame(pair$e_dmt, pair$f_dmt) score_dmt <- mean(abs(pf_dmt)) - pf_pcb <- NULL; score_pcb <- NA_real_; contrast <- NA_real_ + pf_pcb <- NULL + score_pcb <- NA_real_ + contrast <- NA_real_ if (!is.null(pair$e_pcb) && !is.null(pair$f_pcb)) { pf_pcb <- .entheo_binding_per_frame(pair$e_pcb, pair$f_pcb) score_pcb <- mean(abs(pf_pcb)) contrast <- score_dmt - score_pcb } - list(score = score_dmt, - score_dmt = score_dmt, score_pcb = score_pcb, - contrast = contrast, - per_frame_dmt = pf_dmt, per_frame_pcb = pf_pcb, - method = "Bayne-Laukkonen Beautiful Loop (v0.4.0-alpha toy)", - warnings = warnings_vec) + list( + score = score_dmt, + score_dmt = score_dmt, score_pcb = score_pcb, + contrast = contrast, + per_frame_dmt = pf_dmt, per_frame_pcb = pf_pcb, + method = "Bayne-Laukkonen Beautiful Loop (v0.4.0-alpha toy)", + warnings = warnings_vec + ) } @@ -57,26 +63,32 @@ beautiful_loop_metric <- function(eeg, fmri = NULL) { san_score <- function(eeg, fmri = NULL) { pair <- .entheo_extract_pair(eeg, fmri) if (is.null(pair$e_dmt) || is.null(pair$f_dmt)) { - return(list(score = NA_real_, - score_dmt = NA_real_, score_pcb = NA_real_, - contrast = NA_real_, - per_frame_dmt = NULL, per_frame_pcb = NULL, - warnings = "EEG or fMRI missing for primary condition")) + return(list( + score = NA_real_, + score_dmt = NA_real_, score_pcb = NA_real_, + contrast = NA_real_, + per_frame_dmt = NULL, per_frame_pcb = NULL, + warnings = "EEG or fMRI missing for primary condition" + )) } pf_dmt <- .entheo_san_per_frame(pair$e_dmt, pair$f_dmt) score_dmt <- mean(pf_dmt) - pf_pcb <- NULL; score_pcb <- NA_real_; contrast <- NA_real_ + pf_pcb <- NULL + score_pcb <- NA_real_ + contrast <- NA_real_ if (!is.null(pair$e_pcb) && !is.null(pair$f_pcb)) { pf_pcb <- .entheo_san_per_frame(pair$e_pcb, pair$f_pcb) score_pcb <- mean(pf_pcb) contrast <- score_dmt - score_pcb } - list(score = score_dmt, - score_dmt = score_dmt, score_pcb = score_pcb, - contrast = contrast, - per_frame_dmt = pf_dmt, per_frame_pcb = pf_pcb, - method = "Pirez Self-Aware Networks (v0.4.0-alpha toy)", - warnings = character(0)) + list( + score = score_dmt, + score_dmt = score_dmt, score_pcb = score_pcb, + contrast = contrast, + per_frame_dmt = pf_dmt, per_frame_pcb = pf_pcb, + method = "Pirez Self-Aware Networks (v0.4.0-alpha toy)", + warnings = character(0) + ) } @@ -87,8 +99,10 @@ san_score <- function(eeg, fmri = NULL) { .entheo_extract_pair <- function(record_or_eeg, fmri) { if (is.list(record_or_eeg) && !is.null(record_or_eeg$fmri)) { rec <- record_or_eeg - return(list(e_dmt = rec$eeg$data_dmt, f_dmt = rec$fmri$data_dmt, - e_pcb = rec$eeg$data_pcb, f_pcb = rec$fmri$data_pcb)) + return(list( + e_dmt = rec$eeg$data_dmt, f_dmt = rec$fmri$data_dmt, + e_pcb = rec$eeg$data_pcb, f_pcb = rec$fmri$data_pcb + )) } list(e_dmt = record_or_eeg, f_dmt = fmri, e_pcb = NULL, f_pcb = NULL) } @@ -107,11 +121,17 @@ san_score <- function(eeg, fmri = NULL) { e_tc <- if (is.matrix(e)) colMeans(e) else e f_tc <- if (is.matrix(f)) colMeans(f) else f n <- min(length(e_tc), length(f_tc)) - if (n == 0) return(list(e = e_tc, f = f_tc)) + if (n == 0) { + return(list(e = e_tc, f = f_tc)) + } .bin <- function(x) { - if (length(x) == n) return(x) + if (length(x) == n) { + return(x) + } step <- length(x) %/% n - if (step <= 1L) return(x[seq_len(n)]) + if (step <= 1L) { + return(x[seq_len(n)]) + } trimmed <- x[seq_len(step * n)] colMeans(matrix(trimmed, nrow = step)) } @@ -121,9 +141,12 @@ san_score <- function(eeg, fmri = NULL) { .entheo_binding_per_frame <- function(eeg, fmri) { env <- .entheo_envelope(eeg) al <- .entheo_align(env, fmri) - e_tc <- al$e; f_tc <- al$f + e_tc <- al$e + f_tc <- al$f n <- min(length(e_tc), length(f_tc)) - if (n < 4) return(rep(0, n)) + if (n < 4) { + return(rep(0, n)) + } # Replace NA introduced by stats::filter padding with 0. e_tc[is.na(e_tc)] <- 0 f_grad <- c(diff(f_tc), 0) @@ -145,9 +168,12 @@ san_score <- function(eeg, fmri = NULL) { .entheo_san_per_frame <- function(eeg, fmri) { al <- .entheo_align(eeg, fmri) - e_tc <- al$e; f_tc <- al$f + e_tc <- al$e + f_tc <- al$f n <- min(length(e_tc), length(f_tc)) - if (n < 4) return(rep(0, n)) + if (n < 4) { + return(rep(0, n)) + } joint <- rbind(e_tc[seq_len(n)], f_tc[seq_len(n)]) joint <- (joint - rowMeans(joint)) / (apply(joint, 1, stats::sd) + 1e-9) diff --git a/r-package/morie/R/entheo_data.R b/r-package/morie/R/entheo_data.R index 5cc1d7a224..3db1976652 100644 --- a/r-package/morie/R/entheo_data.R +++ b/r-package/morie/R/entheo_data.R @@ -45,7 +45,8 @@ load_dmt_imaging <- function(subject_id = NULL, root = NULL) { if (!root_exists) { warnings_vec <- c( sprintf("DMT_Imaging root not found at %s; using synthetic fixture.", root), - warnings_vec) + warnings_vec + ) } records <- vector("list", length(subs)) @@ -57,7 +58,8 @@ load_dmt_imaging <- function(subject_id = NULL, root = NULL) { rec <- .entheo_load_real(sid, root) if (is.null(rec)) { warnings_vec <- c(warnings_vec, sprintf( - "subject %s: .mat files present but unloadable (install R.matlab)", sid)) + "subject %s: .mat files present but unloadable (install R.matlab)", sid + )) } } if (is.null(rec)) { @@ -68,11 +70,11 @@ load_dmt_imaging <- function(subject_id = NULL, root = NULL) { } list( - records = records, - root = if (root_exists) root else NA_character_, + records = records, + root = if (root_exists) root else NA_character_, synthetic = any_synth, subject_ids = subs, - warnings = warnings_vec + warnings = warnings_vec ) } @@ -80,7 +82,9 @@ load_dmt_imaging <- function(subject_id = NULL, root = NULL) { #' @keywords internal .entheo_list_subjects <- function(root) { fmri_dir <- file.path(root, "fMRI") - if (!dir.exists(fmri_dir)) return(character(0)) + if (!dir.exists(fmri_dir)) { + return(character(0)) + } files <- list.files(fmri_dir, pattern = "^LongS\\d{2}(DMT|PCB)\\.mat$") ids <- unique(substr(files, 6L, 7L)) sort(ids) @@ -102,12 +106,16 @@ load_dmt_imaging <- function(subject_id = NULL, root = NULL) { data_pcb = matrix(stats::rnorm(n_chan * n_tp), n_chan, n_tp) ), fmri = list( - tr = 2.0, + tr = 2.0, n_parcels = n_parcels, - data_dmt = matrix(stats::rnorm(n_parcels * (n_tp %/% 4L)), - n_parcels, n_tp %/% 4L), - data_pcb = matrix(stats::rnorm(n_parcels * (n_tp %/% 4L)), - n_parcels, n_tp %/% 4L), + data_dmt = matrix( + stats::rnorm(n_parcels * (n_tp %/% 4L)), + n_parcels, n_tp %/% 4L + ), + data_pcb = matrix( + stats::rnorm(n_parcels * (n_tp %/% 4L)), + n_parcels, n_tp %/% 4L + ), motion_fd_mm = stats::runif(n_tp %/% 4L, 0, 0.6) ), behavioural = list( @@ -121,13 +129,19 @@ load_dmt_imaging <- function(subject_id = NULL, root = NULL) { #' @keywords internal .entheo_load_real <- function(subject_id, root) { - if (!requireNamespace("R.matlab", quietly = TRUE)) return(NULL) + if (!requireNamespace("R.matlab", quietly = TRUE)) { + return(NULL) + } f_dmt <- file.path(root, "fMRI", sprintf("LongS%sDMT.mat", subject_id)) f_pcb <- file.path(root, "fMRI", sprintf("LongS%sPCB.mat", subject_id)) - if (!(file.exists(f_dmt) && file.exists(f_pcb))) return(NULL) + if (!(file.exists(f_dmt) && file.exists(f_pcb))) { + return(NULL) + } blob_dmt <- tryCatch(R.matlab::readMat(f_dmt), error = function(e) NULL) blob_pcb <- tryCatch(R.matlab::readMat(f_pcb), error = function(e) NULL) - if (is.null(blob_dmt) || is.null(blob_pcb)) return(NULL) + if (is.null(blob_dmt) || is.null(blob_pcb)) { + return(NULL) + } .pick_largest <- function(blob) { best <- NULL @@ -144,11 +158,15 @@ load_dmt_imaging <- function(subject_id = NULL, root = NULL) { list( subject_id = subject_id, condition_order = c("DMT", "PCB"), - eeg = list(sfreq = NA_real_, channels = character(0), - data_dmt = NULL, data_pcb = NULL), - fmri = list(tr = 2.0, n_parcels = nrow(arr_dmt), - data_dmt = arr_dmt, data_pcb = arr_pcb, - motion_fd_mm = NULL), + eeg = list( + sfreq = NA_real_, channels = character(0), + data_dmt = NULL, data_pcb = NULL + ), + fmri = list( + tr = 2.0, n_parcels = nrow(arr_dmt), + data_dmt = arr_dmt, data_pcb = arr_pcb, + motion_fd_mm = NULL + ), behavioural = list(), .synthetic = FALSE, .paths = list(fmri_dmt = f_dmt, fmri_pcb = f_pcb) diff --git a/r-package/morie/R/entheo_preprocess.R b/r-package/morie/R/entheo_preprocess.R index 64219b25b1..c71680605b 100644 --- a/r-package/morie/R/entheo_preprocess.R +++ b/r-package/morie/R/entheo_preprocess.R @@ -40,17 +40,20 @@ preprocess_eeg <- function(record, cleaned$eeg[[key]] <- arr } - list(record = cleaned, - n_bad = n_bad_total, - sfreq = sfreq, - bandpass = bandpass, - notch = notch, - asr_threshold = asr_threshold, - n_channels = n_chan, - warnings = warnings_vec, - interpretation = sprintf( - "EEG bandpass-filtered (%g-%g Hz) and notch-filtered at %g Hz; %d sample(s) reconstructed by toy ASR.", - bandpass[1], bandpass[2], notch, n_bad_total)) + list( + record = cleaned, + n_bad = n_bad_total, + sfreq = sfreq, + bandpass = bandpass, + notch = notch, + asr_threshold = asr_threshold, + n_channels = n_chan, + warnings = warnings_vec, + interpretation = sprintf( + "EEG bandpass-filtered (%g-%g Hz) and notch-filtered at %g Hz; %d sample(s) reconstructed by toy ASR.", + bandpass[1], bandpass[2], notch, n_bad_total + ) + ) } @@ -88,8 +91,10 @@ preprocess_fmri <- function(record, n_scrubbed <- n_scrubbed + sum(bad) if (any(bad)) arr[, bad] <- 0 } else { - warnings_vec <- c(warnings_vec, - sprintf("fmri.motion_fd_mm absent -- skipping scrubbing on %s", key)) + warnings_vec <- c( + warnings_vec, + sprintf("fmri.motion_fd_mm absent -- skipping scrubbing on %s", key) + ) } sv <- tryCatch(svd(arr), error = function(e) NULL) if (!is.null(sv)) { @@ -103,15 +108,21 @@ preprocess_fmri <- function(record, cleaned$fmri[[key]] <- arr } - list(record = cleaned, - n_scrubbed = n_scrubbed, - motion_threshold_mm = motion_threshold_mm, - n_noise_components = n_noise_components, - n_parcels = n_parcels, - warnings = warnings_vec, - interpretation = sprintf( - "Motion-scrubbed %d volume(s) above %g mm FD; top-%d singular components projected out as toy ICA-AROMA stand-in.", - n_scrubbed, motion_threshold_mm, n_noise_components)) + list( + record = cleaned, + n_scrubbed = n_scrubbed, + motion_threshold_mm = motion_threshold_mm, + n_noise_components = n_noise_components, + n_parcels = n_parcels, + warnings = warnings_vec, + interpretation = sprintf( + paste0( + "Motion-scrubbed %d volume(s) above %g mm FD; top-%d singular ", + "components projected out as toy ICA-AROMA stand-in." + ), + n_scrubbed, motion_threshold_mm, n_noise_components + ) + ) } @@ -142,9 +153,12 @@ preprocess_fmri <- function(record, .entheo_notch <- function(x, sfreq, freq, q = 30) { if (requireNamespace("signal", quietly = TRUE)) { bw <- freq / q - bf <- signal::butter(2, c((freq - bw / 2) / (sfreq / 2), - (freq + bw / 2) / (sfreq / 2)), - type = "stop") + bf <- signal::butter(2, c( + (freq - bw / 2) / (sfreq / 2), + (freq + bw / 2) / (sfreq / 2) + ), + type = "stop" + ) out <- t(apply(x, 1, function(row) signal::filtfilt(bf, row))) return(out) } diff --git a/r-package/morie/R/ewtma.R b/r-package/morie/R/ewtma.R index f8dbb8867b..4c8c80f3cc 100644 --- a/r-package/morie/R/ewtma.R +++ b/r-package/morie/R/ewtma.R @@ -2,27 +2,28 @@ #' EWMA volatility (RiskMetrics 1996) #' -#' @inheritParams garch_fit +#' @inheritParams morie_garch_fit #' @param lambda Decay factor in (0,1). Default 0.94 (daily RiskMetrics). #' @return Named list with \code{conditional_variance, conditional_volatility, #' lambda, n, last_variance, last_volatility, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ewma_volatility(x = rnorm(50)) #' @export -ewma_volatility <- function(x, lambda = 0.94) { - r <- as.numeric(x); n <- length(r) +morie_ewma_volatility <- function(x, lambda = 0.94) { + r <- as.numeric(x) + n <- length(r) if (n < 2) stop("Need >=2 obs.") if (lambda <= 0 || lambda >= 1) stop("lambda must be in (0,1).") r2 <- r^2 - s2 <- numeric(n); s2[1] <- r2[1] + s2 <- numeric(n) + s2[1] <- r2[1] for (t in 2:n) s2[t] <- lambda * s2[t - 1] + (1 - lambda) * r2[t - 1] - list(conditional_variance = s2, - conditional_volatility = sqrt(s2), - lambda = lambda, n = n, - last_variance = s2[n], - last_volatility = sqrt(s2[n]), - method = "EWMA RiskMetrics") + list( + conditional_variance = s2, + conditional_volatility = sqrt(s2), + lambda = lambda, n = n, + last_variance = s2[n], + last_volatility = sqrt(s2[n]), + method = "EWMA RiskMetrics" + ) } diff --git a/r-package/morie/R/extvm.R b/r-package/morie/R/extvm.R index a56a156c0f..455e8d424c 100644 --- a/r-package/morie/R/extvm.R +++ b/r-package/morie/R/extvm.R @@ -1,4 +1,26 @@ # SPDX-License-Identifier: AGPL-3.0-or-later + +# Internal: GEV per-observation log-density (Coles xi convention). +# Lifted from the extvm() optimiser closure so the xi ~ 0 (Gumbel) and +# out-of-support branches are directly unit-testable; the BFGS search in +# extvm() is not guaranteed to probe xi within 1e-8 of zero. +.extvm_log_gev <- function(par, x) { + mu <- par[1] + sigma <- exp(par[2]) + xi <- par[3] + z <- (x - mu) / sigma + if (abs(xi) < 1e-8) { + ll <- -log(sigma) - z - exp(-z) + } else { + arg <- 1 + xi * z + if (any(arg <= 0)) { + return(rep(-1e10, length(x))) + } + ll <- -log(sigma) - (1 + 1 / xi) * log(arg) - arg^(-1 / xi) + } + ll +} + #' Generalised Extreme Value fit by ML (Coles 2001) #' #' Fits F(x) = exp(-(1 + xi (x - mu)/sigma)^(-1/xi)) by maximum @@ -9,39 +31,33 @@ #' @return list: mu, sigma, xi, se_mu, se_sigma, se_xi, loglik, n, method. #' @keywords internal extvm <- function(x) { - x <- as.numeric(x); n <- length(x) - if (n < 5L) + x <- as.numeric(x) + n <- length(x) + if (n < 5L) { return(list(estimate = NA_real_, n = n, method = "GEV (n<5)")) - # GEV density (Coles xi convention) - log_gev <- function(par, x) { - mu <- par[1]; sigma <- exp(par[2]); xi <- par[3] - z <- (x - mu) / sigma - if (abs(xi) < 1e-8) { - ll <- -log(sigma) - z - exp(-z) - } else { - arg <- 1 + xi * z - if (any(arg <= 0)) return(rep(-1e10, length(x))) - ll <- -log(sigma) - (1 + 1/xi) * log(arg) - arg^(-1/xi) - } - ll } - nll <- function(par) -sum(log_gev(par, x)) + nll <- function(par) -sum(.extvm_log_gev(par, x)) init <- c(mean(x), log(stats::sd(x)), 0.1) fit <- stats::optim(init, nll, method = "BFGS", hessian = TRUE) - mu <- fit$par[1]; sigma <- exp(fit$par[2]); xi <- fit$par[3] + mu <- fit$par[1] + sigma <- exp(fit$par[2]) + xi <- fit$par[3] loglik <- -fit$value # Hessian is wrt (mu, log_sigma, xi); convert to (mu, sigma, xi) J <- diag(c(1, sigma, 1)) cov_mat <- tryCatch(J %*% solve(fit$hessian) %*% t(J), - error = function(e) matrix(NA, 3, 3)) + error = function(e) matrix(NA, 3, 3) + ) ses <- sqrt(pmax(diag(cov_mat), 0)) - list(mu = as.numeric(mu), sigma = as.numeric(sigma), xi = as.numeric(xi), - se_mu = as.numeric(ses[1]), se_sigma = as.numeric(ses[2]), - se_xi = as.numeric(ses[3]), - loglik = as.numeric(loglik), - estimate = as.numeric(mu), se = as.numeric(ses[1]), - n = as.integer(n), - method = "GEV MLE (Coles 2001)") + list( + mu = as.numeric(mu), sigma = as.numeric(sigma), xi = as.numeric(xi), + se_mu = as.numeric(ses[1]), se_sigma = as.numeric(ses[2]), + se_xi = as.numeric(ses[3]), + loglik = as.numeric(loglik), + estimate = as.numeric(mu), se = as.numeric(ses[1]), + n = as.integer(n), + method = "GEV MLE (Coles 2001)" + ) } # CANONICAL TEST @@ -52,4 +68,4 @@ extvm <- function(x) { #' @rdname extvm #' @keywords internal #' @export -extreme_value_gev <- extvm +morie_extreme_value_gev <- extvm diff --git a/r-package/morie/R/fast.R b/r-package/morie/R/fast.R index eda8227e65..f93a2b845c 100644 --- a/r-package/morie/R/fast.R +++ b/r-package/morie/R/fast.R @@ -51,7 +51,9 @@ morie_var <- function(x, ddof = 1) { morie_var_cpp(as.numeric(x), as.integer(ddof)) } else { n <- length(x) - if (n - ddof <= 0) return(NA_real_) + if (n - ddof <= 0) { + return(NA_real_) + } sum((x - mean(x))^2) / (n - ddof) } } @@ -71,10 +73,13 @@ morie_cor_pearson <- function(x, y) { # Internal: detect whether the Rcpp .so was successfully built. .cpp_available <- function() { - tryCatch({ - exists("morie_normal_pdf_cpp", mode = "function") && - is.function(get("morie_normal_pdf_cpp")) - }, error = function(e) FALSE) + tryCatch( + { + exists("morie_normal_pdf_cpp", mode = "function") && + is.function(get("morie_normal_pdf_cpp")) + }, + error = function(e) FALSE + ) } #' Is the R-side JIT acceleration active? @@ -83,11 +88,10 @@ morie_cor_pearson <- function(x, y) { #' Returns TRUE when the Rcpp .so was built and loaded; FALSE when #' falling back to base-R implementations. #' +#' @return A logical scalar: \code{TRUE} when the compiled Rcpp backend was +#' built and loaded, \code{FALSE} when falling back to base-R kernels. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_fast_available() #' @export morie_fast_available <- function() { .cpp_available() diff --git a/r-package/morie/R/flsha.R b/r-package/morie/R/flsha.R index b5700f6350..fef3a0cc33 100644 --- a/r-package/morie/R/flsha.R +++ b/r-package/morie/R/flsha.R @@ -13,8 +13,12 @@ flash_attention <- function(Q, K = NULL, V = NULL, block_size = 32L, mask = NULL) { if (is.null(K)) K <- Q if (is.null(V)) V <- Q - Q <- as.matrix(Q); K <- as.matrix(K); V <- as.matrix(V) - N <- nrow(Q); d <- ncol(Q); M <- nrow(K) + Q <- as.matrix(Q) + K <- as.matrix(K) + V <- as.matrix(V) + N <- nrow(Q) + d <- ncol(Q) + M <- nrow(K) scale <- 1 / sqrt(d) out <- matrix(0, N, d) row_max <- rep(-Inf, N) @@ -28,14 +32,16 @@ flash_attention <- function(Q, K = NULL, V = NULL, block_size = 32L, if (!is.null(mask)) s <- s + as.matrix(mask)[, j:je, drop = FALSE] bm <- apply(s, 1L, max) new_max <- pmax(row_max, bm) - alpha <- exp(row_max - new_max) # length N - beta <- exp(sweep(s, 1L, new_max, "-")) # N x k, row-wise + alpha <- exp(row_max - new_max) # length N + beta <- exp(sweep(s, 1L, new_max, "-")) # N x k, row-wise row_den <- row_den * alpha + rowSums(beta) out <- sweep(out, 1L, alpha, "*") + beta %*% Vj row_max <- new_max j <- je + 1L } out <- sweep(out, 1L, row_den, "/") - list(tensor = out, block_size = block_size, - method = "flash-attention") + list( + tensor = out, block_size = block_size, + method = "flash-attention" + ) } diff --git a/r-package/morie/R/frns_metrics.R b/r-package/morie/R/frns_metrics.R index 94ce15a63a..24b925df1b 100644 --- a/r-package/morie/R/frns_metrics.R +++ b/r-package/morie/R/frns_metrics.R @@ -9,12 +9,12 @@ #' only measure disparity in predictions that already exist. #' #' Functions: -#' * `fairness_disparate_impact()`: the EEOC four-fifths rule. -#' * `fairness_demographic_parity()`: favourable-rate gap. -#' * `fairness_equalized_odds()`: TPR/FPR gaps (needs ground truth). -#' * `fairness_average_odds_difference()`: mean TPR+FPR gap. -#' * `fairness_gini()`: concentration of a score distribution. -#' * `fairness_bias_amplification()`: composite `Delta_parity * G`. +#' * `morie_fairness_disparate_impact()`: the EEOC four-fifths rule. +#' * `morie_fairness_demographic_parity()`: favourable-rate gap. +#' * `morie_fairness_equalized_odds()`: TPR/FPR gaps (needs ground truth). +#' * `morie_fairness_average_odds_difference()`: mean TPR+FPR gap. +#' * `morie_fairness_gini()`: concentration of a score distribution. +#' * `morie_fairness_bias_amplification()`: composite `Delta_parity * G`. #' #' Each returns a named `list` with the metric value, a per-group #' breakdown, any advisory `warnings`, and a plain-language @@ -27,10 +27,17 @@ #' (Lacherade, Szabo, Krikava & Aeby, 2021); and Barman & Barman, #' arXiv:2603.18987 (the Bias Amplification Score). #' +#' @return Each callable in this module returns a named \code{list} with the +#' metric \code{value}, a per-group breakdown, advisory \code{warnings}, and +#' a plain-language \code{interpretation}. +#' @examples +#' pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0) +#' race <- c(rep("A", 5), rep("B", 5)) +#' morie_fairness_disparate_impact(pred, race, privileged = "A")$value #' @name frns_metrics NULL -.FRNS_FOUR_FIFTHS <- 0.8 # EEOC four-fifths adverse-impact threshold +.FRNS_FOUR_FIFTHS <- 0.8 # EEOC four-fifths adverse-impact threshold # ---- internal helpers ----------------------------------------------------- @@ -40,9 +47,13 @@ NULL lengths <- vapply(args, function(a) length(a[[2]]), integer(1)) if (length(unique(lengths)) > 1L) { nm <- vapply(args, function(a) a[[1]], character(1)) - stop(sprintf("length mismatch across inputs: %s", - paste(sprintf("%s=%d", nm, lengths), collapse = ", ")), - call. = FALSE) + stop( + sprintf( + "length mismatch across inputs: %s", + paste(sprintf("%s=%d", nm, lengths), collapse = ", ") + ), + call. = FALSE + ) } if (lengths[1] == 0L) stop("inputs are empty", call. = FALSE) } @@ -65,17 +76,22 @@ NULL if (!is.null(privileged)) { pk <- as.character(privileged) if (!pk %in% keys) { - stop(sprintf("privileged group '%s' not found; groups present: %s", - pk, paste(keys, collapse = ", ")), call. = FALSE) + stop(sprintf( + "privileged group '%s' not found; groups present: %s", + pk, paste(keys, collapse = ", ") + ), call. = FALSE) } return(list(privileged = pk, warning = NULL)) } rate_vals <- vapply(rates, function(r) r$rate, numeric(1)) pk <- keys[which.max(rate_vals)] list(privileged = pk, warning = sprintf( - paste0("`privileged` not given; inferred as '%s' (the group with the ", - "highest favourable-outcome rate). Pass `privileged=` to audit ", - "against a specific reference group."), pk)) + paste0( + "`privileged` not given; inferred as '%s' (the group with the ", + "highest favourable-outcome rate). Pass `privileged=` to audit ", + "against a specific reference group." + ), pk + )) } .frns_rates_from_labels <- function(y_true, y_pred, group, favorable) { @@ -83,13 +99,16 @@ NULL out <- list() for (g in groups) { m <- group == g - gt <- y_true[m]; gp <- y_pred[m] + gt <- y_true[m] + gp <- y_pred[m] pos <- gt == favorable neg <- !pos tpr <- if (any(pos)) mean(gp[pos] == favorable) else NA_real_ fpr <- if (any(neg)) mean(gp[neg] == favorable) else NA_real_ - out[[as.character(g)]] <- list(value = g, n = sum(m), - tpr = tpr, fpr = fpr) + out[[as.character(g)]] <- list( + value = g, n = sum(m), + tpr = tpr, fpr = fpr + ) } out } @@ -100,7 +119,9 @@ NULL x <- sort(as.numeric(x)) n <- length(x) total <- sum(x) - if (n < 2L || total <= 0) return(0) + if (n < 2L || total <= 0) { + return(0) + } idx <- seq_len(n) (2 * sum(idx * x)) / (n * total) - (n + 1) / n } @@ -108,7 +129,9 @@ NULL .frns_worst_abs <- function(values) { # The element with the largest absolute value (finite only); NA if none. finite <- values[is.finite(values)] - if (length(finite) == 0L) return(NA_real_) + if (length(finite) == 0L) { + return(NA_real_) + } finite[which.max(abs(finite))] } @@ -134,10 +157,10 @@ NULL #' @examples #' pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0) #' race <- c(rep("A", 5), rep("B", 5)) -#' res <- fairness_disparate_impact(pred, race, privileged = "A") -#' res$value # 0.6 (group B rate 0.6 / group A rate 1.0) -#' res$adverse_impact # TRUE -fairness_disparate_impact <- function(y_pred, group, privileged = NULL, +#' res <- morie_fairness_disparate_impact(pred, race, privileged = "A") +#' res$value # 0.6 (group B rate 0.6 / group A rate 1.0) +#' res$adverse_impact # TRUE +morie_fairness_disparate_impact <- function(y_pred, group, privileged = NULL, favorable = 1) { .frns_check_aligned(list("y_pred", y_pred), list("group", group)) rates <- .frns_favorable_rates(y_pred, group, favorable) @@ -151,16 +174,23 @@ fairness_disparate_impact <- function(y_pred, group, privileged = NULL, base <- rates[[priv]]$rate if (base == 0) { warnings <- c(warnings, sprintf( - paste0("privileged group '%s' has a zero favourable-outcome rate; ", - "disparate-impact ratios are undefined and reported as NA."), - priv)) + paste0( + "privileged group '%s' has a zero favourable-outcome rate; ", + "disparate-impact ratios are undefined and reported as NA." + ), + priv + )) } ratios <- list() for (k in names(rates)) { - ratios[[k]] <- if (k == priv) 1.0 - else if (base == 0) NA_real_ - else rates[[k]]$rate / base + ratios[[k]] <- if (k == priv) { + 1.0 + } else if (base == 0) { + NA_real_ + } else { + rates[[k]]$rate / base + } } non_ref <- unlist(ratios[names(ratios) != priv]) finite <- non_ref[is.finite(non_ref)] @@ -170,12 +200,18 @@ fairness_disparate_impact <- function(y_pred, group, privileged = NULL, interp <- if (!is.finite(worst)) { "Disparate-impact ratio could not be computed (privileged group has no favourable outcomes)." } else if (adverse) { - sprintf(paste0("Adverse impact detected: the worst disparate-impact ", - "ratio is %.3f, below the 0.80 four-fifths threshold."), - worst) + sprintf( + paste0( + "Adverse impact detected: the worst disparate-impact ", + "ratio is %.3f, below the 0.80 four-fifths threshold." + ), + worst + ) } else { - sprintf(paste0("No adverse impact under the four-fifths rule: the ", - "worst disparate-impact ratio is %.3f (>= 0.80)."), worst) + sprintf(paste0( + "No adverse impact under the four-fifths rule: the ", + "worst disparate-impact ratio is %.3f (>= 0.80)." + ), worst) } list( @@ -199,16 +235,16 @@ fairness_disparate_impact <- function(y_pred, group, privileged = NULL, #' `rate(group) - rate(privileged)`. Demographic parity holds when every #' group receives favourable outcomes at the same rate. #' -#' @inheritParams fairness_disparate_impact +#' @inheritParams morie_fairness_disparate_impact #' @return A named list: `value` (largest absolute gap), `gaps`, `rates`, #' `privileged`, `warnings`, `interpretation`. #' @export #' @examples #' pred <- c(1, 1, 1, 1, 0, 0, 0, 1, 0, 0) #' race <- c(rep("A", 5), rep("B", 5)) -#' res <- fairness_demographic_parity(pred, race, privileged = "A") -#' res$value # -0.6 (group B rate 0.2 minus group A rate 0.8) -fairness_demographic_parity <- function(y_pred, group, privileged = NULL, +#' res <- morie_fairness_demographic_parity(pred, race, privileged = "A") +#' res$value # -0.6 (group B rate 0.2 minus group A rate 0.8) +morie_fairness_demographic_parity <- function(y_pred, group, privileged = NULL, favorable = 1) { .frns_check_aligned(list("y_pred", y_pred), list("group", group)) rates <- .frns_favorable_rates(y_pred, group, favorable) @@ -227,14 +263,17 @@ fairness_demographic_parity <- function(y_pred, group, privileged = NULL, worst <- .frns_worst_abs(non_ref) interp <- sprintf( - paste0("The largest favourable-rate gap is %+.3f (group rate minus ", - "the '%s' reference rate). %s"), + paste0( + "The largest favourable-rate gap is %+.3f (group rate minus ", + "the '%s' reference rate). %s" + ), worst, priv, if (is.finite(worst) && abs(worst) >= 0.1) { "Favourable-outcome rates differ materially across groups." } else { "Favourable-outcome rates are close to parity." - }) + } + ) list( value = worst, @@ -267,39 +306,51 @@ fairness_demographic_parity <- function(y_pred, group, privileged = NULL, #' @export #' @examples #' truth <- c(1, 0, 1, 0, 1, 0, 1, 0) -#' pred <- c(1, 0, 1, 0, 1, 1, 0, 1) -#' race <- c(rep("A", 4), rep("B", 4)) -#' res <- fairness_equalized_odds(truth, pred, race, privileged = "A") -#' res$violation # TRUE -fairness_equalized_odds <- function(y_true, y_pred, group, +#' pred <- c(1, 0, 1, 0, 1, 1, 0, 1) +#' race <- c(rep("A", 4), rep("B", 4)) +#' res <- morie_fairness_equalized_odds(truth, pred, race, privileged = "A") +#' res$violation # TRUE +morie_fairness_equalized_odds <- function(y_true, y_pred, group, privileged = NULL, favorable = 1) { - .frns_check_aligned(list("y_true", y_true), list("y_pred", y_pred), - list("group", group)) + .frns_check_aligned( + list("y_true", y_true), list("y_pred", y_pred), + list("group", group) + ) per <- .frns_rates_from_labels(y_true, y_pred, group, favorable) if (length(per) < 2L) { stop("need at least two groups to measure disparity", call. = FALSE) } warnings <- character(0) - rate_view <- lapply(per, function(d) list(value = d$value, n = d$n, - rate = d$tpr)) + rate_view <- lapply(per, function(d) { + list( + value = d$value, n = d$n, + rate = d$tpr + ) + }) pr <- .frns_resolve_privileged(privileged, rate_view) priv <- pr$privileged if (!is.null(pr$warning)) warnings <- c(warnings, pr$warning) base_tpr <- per[[priv]]$tpr base_fpr <- per[[priv]]$fpr - tpr_gaps <- list(); fpr_gaps <- list() + tpr_gaps <- list() + fpr_gaps <- list() for (k in names(per)) { tpr_gaps[[k]] <- per[[k]]$tpr - base_tpr fpr_gaps[[k]] <- per[[k]]$fpr - base_fpr if (is.na(per[[k]]$tpr) || is.na(per[[k]]$fpr)) { warnings <- c(warnings, sprintf( - paste0("group '%s' has no positive or no negative ground-truth ", - "cases; its TPR/FPR (and gaps) are partly undefined."), k)) + paste0( + "group '%s' has no positive or no negative ground-truth ", + "cases; its TPR/FPR (and gaps) are partly undefined." + ), k + )) } } - all_gaps <- c(unlist(tpr_gaps[names(tpr_gaps) != priv]), - unlist(fpr_gaps[names(fpr_gaps) != priv])) + all_gaps <- c( + unlist(tpr_gaps[names(tpr_gaps) != priv]), + unlist(fpr_gaps[names(fpr_gaps) != priv]) + ) worst <- .frns_worst_abs(all_gaps) violation <- isTRUE(is.finite(worst) && abs(worst) >= 0.1) @@ -309,7 +360,8 @@ fairness_equalized_odds <- function(y_true, y_pred, group, "Error rates differ substantially across groups." } else { "TPR and FPR are close across groups." - }) + } + ) list( value = worst, @@ -333,30 +385,37 @@ fairness_equalized_odds <- function(y_true, y_pred, group, #' parity of errors. This is the single-number summary used in IBM #' AIF360 and the COMPAS *XAI Stories* audit. #' -#' @inheritParams fairness_equalized_odds +#' @inheritParams morie_fairness_equalized_odds #' @return A named list: `value` (largest absolute AOD), #' `average_odds_difference`, `rates`, `privileged`, `warnings`, #' `interpretation`. #' @export #' @examples #' truth <- c(1, 0, 1, 0, 1, 0, 1, 0) -#' pred <- c(1, 0, 1, 0, 1, 1, 0, 1) -#' race <- c(rep("A", 4), rep("B", 4)) -#' res <- fairness_average_odds_difference(truth, pred, race, -#' privileged = "A") -#' res$value # 0.25 -fairness_average_odds_difference <- function(y_true, y_pred, group, +#' pred <- c(1, 0, 1, 0, 1, 1, 0, 1) +#' race <- c(rep("A", 4), rep("B", 4)) +#' res <- morie_fairness_average_odds_difference(truth, pred, race, +#' privileged = "A" +#' ) +#' res$value # 0.25 +morie_fairness_average_odds_difference <- function(y_true, y_pred, group, privileged = NULL, favorable = 1) { - .frns_check_aligned(list("y_true", y_true), list("y_pred", y_pred), - list("group", group)) + .frns_check_aligned( + list("y_true", y_true), list("y_pred", y_pred), + list("group", group) + ) per <- .frns_rates_from_labels(y_true, y_pred, group, favorable) if (length(per) < 2L) { stop("need at least two groups to measure disparity", call. = FALSE) } warnings <- character(0) - rate_view <- lapply(per, function(d) list(value = d$value, n = d$n, - rate = d$tpr)) + rate_view <- lapply(per, function(d) { + list( + value = d$value, n = d$n, + rate = d$tpr + ) + }) pr <- .frns_resolve_privileged(privileged, rate_view) priv <- pr$privileged if (!is.null(pr$warning)) warnings <- c(warnings, pr$warning) @@ -366,15 +425,18 @@ fairness_average_odds_difference <- function(y_true, y_pred, group, aod <- list() for (k in names(per)) { aod[[k]] <- 0.5 * ((per[[k]]$fpr - base_fpr) + - (per[[k]]$tpr - base_tpr)) + (per[[k]]$tpr - base_tpr)) } non_ref <- unlist(aod[names(aod) != priv]) worst <- .frns_worst_abs(non_ref) interp <- sprintf( - paste0("The largest average odds difference is %+.3f. Zero is parity; ", - "values away from zero mean the combined error profile favours ", - "one group over another."), worst) + paste0( + "The largest average odds difference is %+.3f. Zero is parity; ", + "values away from zero mean the combined error profile favours ", + "one group over another." + ), worst + ) list( value = worst, @@ -403,16 +465,17 @@ fairness_average_odds_difference <- function(y_true, y_pred, group, #' `warnings`, `interpretation`. #' @export #' @examples -#' fairness_gini(c(5, 5, 5, 5))$value # 0 -#' fairness_gini(c(0, 0, 0, 100))$value # 0.75 -fairness_gini <- function(values, group = NULL) { +#' morie_fairness_gini(c(5, 5, 5, 5))$value # 0 +#' morie_fairness_gini(c(0, 0, 0, 100))$value # 0.75 +morie_fairness_gini <- function(values, group = NULL) { if (length(values) == 0L) stop("values is empty", call. = FALSE) vals <- as.numeric(values) warnings <- character(0) if (any(vals < 0)) { warnings <- c(warnings, paste0( "negative values present; the Gini coefficient assumes ", - "non-negative quantities and the result may be uninformative.")) + "non-negative quantities and the result may be uninformative." + )) } overall <- .frns_gini(vals) @@ -424,12 +487,14 @@ fairness_gini <- function(values, group = NULL) { } } - interp <- sprintf("Gini = %.3f. %s", overall, + interp <- sprintf( + "Gini = %.3f. %s", overall, if (overall >= 0.5) { "The quantity is highly concentrated." } else { "The quantity is relatively evenly spread." - }) + } + ) list( value = overall, @@ -453,7 +518,7 @@ fairness_gini <- function(values, group = NULL) { #' Reimplemented from Barman & Barman, "Unmasking Algorithmic Bias in #' Predictive Policing" (arXiv:2603.18987). #' -#' @inheritParams fairness_disparate_impact +#' @inheritParams morie_fairness_disparate_impact #' @return A named list: `value` (BAS), `bias_amplification_score`, #' `demographic_parity_gap`, `gini`, `rates`, `privileged`, #' `warnings`, `interpretation`. @@ -461,9 +526,9 @@ fairness_gini <- function(values, group = NULL) { #' @examples #' pred <- c(1, 1, 1, 1, 0, 0, 0, 0) #' race <- c(rep("A", 4), rep("B", 4)) -#' res <- fairness_bias_amplification(pred, race, privileged = "A") -#' res$value # -0.5 (parity gap -1.0 times Gini 0.5) -fairness_bias_amplification <- function(y_pred, group, privileged = NULL, +#' res <- morie_fairness_bias_amplification(pred, race, privileged = "A") +#' res$value # -0.5 (parity gap -1.0 times Gini 0.5) +morie_fairness_bias_amplification <- function(y_pred, group, privileged = NULL, favorable = 1) { .frns_check_aligned(list("y_pred", y_pred), list("group", group)) rates <- .frns_favorable_rates(y_pred, group, favorable) @@ -487,13 +552,16 @@ fairness_bias_amplification <- function(y_pred, group, privileged = NULL, bas <- delta_parity * gini interp <- sprintf( - paste0("Bias Amplification Score = %+.4f (parity gap %+.3f times ", - "Gini %.3f). %s"), bas, delta_parity, gini, + paste0( + "Bias Amplification Score = %+.4f (parity gap %+.3f times ", + "Gini %.3f). %s" + ), bas, delta_parity, gini, if (abs(bas) >= 0.05) { "Both a directional disparity and substantial inequality are present." } else { "At least one component is small, so little amplification is indicated." - }) + } + ) list( value = bas, diff --git a/r-package/morie/R/frns_predpol.R b/r-package/morie/R/frns_predpol.R index a06fb6d9a5..079b0ec219 100644 --- a/r-package/morie/R/frns_predpol.R +++ b/r-package/morie/R/frns_predpol.R @@ -10,16 +10,27 @@ #' composition. #' #' Functions: -#' * `predpol_aggregate_areas()`: roll per-record data up to one row +#' * `morie_predpol_aggregate_areas()`: roll per-record data up to one row #' per area. -#' * `predpol_calibration_audit()`: Spearman calibration plus a +#' * `morie_predpol_calibration_audit()`: Spearman calibration plus a #' per-group mean rank gap (the over-/under-prediction signal). -#' * `predpol_score_disparity()`: descriptive per-group risk-score +#' * `morie_predpol_score_disparity()`: descriptive per-group risk-score #' summary with a one-way ANOVA. #' #' Written from the project's published methodology; no code copied #' (that repository carries no licence and is not redistributable). #' +#' @return \code{morie_predpol_aggregate_areas()} returns a per-area +#' \code{data.frame}; \code{morie_predpol_calibration_audit()} and +#' \code{morie_predpol_score_disparity()} return named \code{list}s of audit +#' statistics, per-group breakdowns, and a plain-language +#' \code{interpretation}. +#' @examples +#' agg <- morie_predpol_aggregate_areas( +#' area = c("a", "a", "b", "b"), risk = c(10, 20, 30, 40), +#' outcome = c(1, 0, 1, 1) +#' ) +#' agg$mean_risk #' @name frns_predpol NULL @@ -27,7 +38,9 @@ NULL .frns_worst_abs_named <- function(x) { # name of the element with the largest absolute value. x <- x[is.finite(x)] - if (length(x) == 0L) return(NA_character_) + if (length(x) == 0L) { + return(NA_character_) + } names(x)[which.max(abs(x))] } @@ -47,12 +60,13 @@ NULL #' `n_records`. #' @export #' @examples -#' agg <- predpol_aggregate_areas( +#' agg <- morie_predpol_aggregate_areas( #' area = c("a", "a", "b", "b"), risk = c(10, 20, 30, 40), -#' outcome = c(1, 0, 1, 1)) -#' agg$mean_risk # 15 35 -#' agg$outcome_rate # 0.5 1.0 -predpol_aggregate_areas <- function(area, risk, outcome, group = NULL, +#' outcome = c(1, 0, 1, 1) +#' ) +#' agg$mean_risk # 15 35 +#' agg$outcome_rate # 0.5 1.0 +morie_predpol_aggregate_areas <- function(area, risk, outcome, group = NULL, population = NULL) { if (length(area) != length(risk) || length(area) != length(outcome)) { stop("area, risk and outcome must be the same length", call. = FALSE) @@ -67,15 +81,18 @@ predpol_aggregate_areas <- function(area, risk, outcome, group = NULL, n_records <- vapply(areas, function(a) sum(area == a), integer(1)) if (is.null(population)) { - outcome_rate <- vapply(areas, function(a) mean(outcome[area == a]), - numeric(1)) + outcome_rate <- vapply( + areas, function(a) mean(outcome[area == a]), + numeric(1) + ) } else { if (!is.null(names(population))) { pops <- as.numeric(population[areas]) } else { if (length(population) != length(area)) { stop("population vector must be the same length as area", - call. = FALSE) + call. = FALSE + ) } population <- as.numeric(population) pops <- vapply(areas, function(a) population[area == a][1], numeric(1)) @@ -95,10 +112,12 @@ predpol_aggregate_areas <- function(area, risk, outcome, group = NULL, }, character(1)) } - list(areas = areas, mean_risk = unname(mean_risk), - outcome_rate = unname(outcome_rate), - group = if (is.null(maj)) NULL else unname(maj), - n_records = unname(n_records)) + list( + areas = areas, mean_risk = unname(mean_risk), + outcome_rate = unname(outcome_rate), + group = if (is.null(maj)) NULL else unname(maj), + n_records = unname(n_records) + ) } @@ -118,20 +137,22 @@ predpol_aggregate_areas <- function(area, risk, outcome, group = NULL, #' `warnings`, `interpretation`. #' @export #' @examples -#' res <- predpol_calibration_audit( +#' res <- morie_predpol_calibration_audit( #' areas = c("d1", "d2", "d3", "d4", "d5", "d6"), #' mean_risk = c(90, 80, 70, 30, 20, 10), #' outcome_rate = c(10, 20, 30, 70, 80, 90), -#' group = c("X", "X", "X", "Y", "Y", "Y")) -#' res$group_rank_gap$X # 3 (group X over-predicted) -#' res$spearman # -1 (perfectly miscalibrated) -predpol_calibration_audit <- function(areas, mean_risk, outcome_rate, +#' group = c("X", "X", "X", "Y", "Y", "Y") +#' ) +#' res$group_rank_gap$X # 3 (group X over-predicted) +#' res$spearman # -1 (perfectly miscalibrated) +morie_predpol_calibration_audit <- function(areas, mean_risk, outcome_rate, group) { n <- length(areas) if (!(n == length(mean_risk) && n == length(outcome_rate) && - n == length(group))) { + n == length(group))) { stop("areas, mean_risk, outcome_rate and group must all align", - call. = FALSE) + call. = FALSE + ) } if (n < 2L) { stop("need at least two areas to compare rankings", call. = FALSE) @@ -146,13 +167,17 @@ predpol_calibration_audit <- function(areas, mean_risk, outcome_rate, if (!all(finite)) { warnings <- c(warnings, sprintf( "%d area(s) had a non-finite risk or outcome value and were dropped.", - sum(!finite))) - areas <- areas[finite]; mean_risk <- mean_risk[finite] - outcome_rate <- outcome_rate[finite]; group <- group[finite] + sum(!finite) + )) + areas <- areas[finite] + mean_risk <- mean_risk[finite] + outcome_rate <- outcome_rate[finite] + group <- group[finite] n <- length(areas) if (n < 2L) { stop("fewer than two areas remain after dropping non-finite rows", - call. = FALSE) + call. = FALSE + ) } } @@ -162,11 +187,13 @@ predpol_calibration_audit <- function(areas, mean_risk, outcome_rate, rank_gap <- outcome_rank - risk_rank ct <- suppressWarnings( - stats::cor.test(mean_risk, outcome_rate, method = "spearman")) + stats::cor.test(mean_risk, outcome_rate, method = "spearman") + ) rho <- unname(ct$estimate) pval <- ct$p.value - per_group <- list(); group_n <- list() + per_group <- list() + group_n <- list() for (gv in unique(group)) { m <- group == gv per_group[[gv]] <- mean(rank_gap[m]) @@ -177,24 +204,32 @@ predpol_calibration_audit <- function(areas, mean_risk, outcome_rate, worst <- pg[[worst_group]] cal <- if (rho >= 0.7) { - sprintf(paste0("Overall the ranking is well calibrated (Spearman ", - "rho = %.2f)."), rho) + sprintf(paste0( + "Overall the ranking is well calibrated (Spearman ", + "rho = %.2f)." + ), rho) } else if (rho >= 0.3) { sprintf("Overall calibration is weak (Spearman rho = %.2f).", rho) } else { - sprintf("Overall the ranking is miscalibrated (Spearman rho = %.2f).", - rho) + sprintf( + "Overall the ranking is miscalibrated (Spearman rho = %.2f).", + rho + ) } disp <- if (abs(worst) <= 0.5) { "No group's areas are systematically mis-ranked." } else if (worst > 0) { - sprintf(paste0("Group '%s' is over-predicted: its areas are ranked, ", - "on average, %.1f rank positions more dangerous than ", - "their realised outcomes warrant."), worst_group, worst) + sprintf(paste0( + "Group '%s' is over-predicted: its areas are ranked, ", + "on average, %.1f rank positions more dangerous than ", + "their realised outcomes warrant." + ), worst_group, worst) } else { - sprintf(paste0("Group '%s' is under-predicted: its areas are ranked, ", - "on average, %.1f rank positions less dangerous than ", - "their realised outcomes."), worst_group, abs(worst)) + sprintf(paste0( + "Group '%s' is under-predicted: its areas are ranked, ", + "on average, %.1f rank positions less dangerous than ", + "their realised outcomes." + ), worst_group, abs(worst)) } list( @@ -215,7 +250,7 @@ predpol_calibration_audit <- function(areas, mean_risk, outcome_rate, #' Reports per-group n / mean / median / sd, a one-way ANOVA for #' whether group membership relates to the score, and each group's #' mean-score gap from a reference group. A significant gap is not -#' itself proof of bias; pair this with `predpol_calibration_audit()`. +#' itself proof of bias; pair this with `morie_predpol_calibration_audit()`. #' #' @param score Continuous risk score, one per individual. #' @param group Protected attribute, one per individual. @@ -226,12 +261,13 @@ predpol_calibration_audit <- function(areas, mean_risk, outcome_rate, #' `reference`, `warnings`, `interpretation`. #' @export #' @examples -#' res <- predpol_score_disparity( +#' res <- morie_predpol_score_disparity( #' score = c(9, 10, 11, 19, 20, 21), -#' group = c("A", "A", "A", "B", "B", "B")) -#' res$value # 10 (group means 10 and 20) -#' res$significant # TRUE -predpol_score_disparity <- function(score, group, reference = NULL) { +#' group = c("A", "A", "A", "B", "B", "B") +#' ) +#' res$value # 10 (group means 10 and 20) +#' res$significant # TRUE +morie_predpol_score_disparity <- function(score, group, reference = NULL) { if (length(score) != length(group)) { stop("score and group must be the same length", call. = FALSE) } @@ -244,8 +280,10 @@ predpol_score_disparity <- function(score, group, reference = NULL) { finite <- is.finite(score) if (!all(finite)) { warnings <- c(warnings, sprintf( - "%d non-finite score value(s) dropped.", sum(!finite))) - score <- score[finite]; group <- group[finite] + "%d non-finite score value(s) dropped.", sum(!finite) + )) + score <- score[finite] + group <- group[finite] } groups <- unique(group) if (length(groups) < 2L) { @@ -256,23 +294,31 @@ predpol_score_disparity <- function(score, group, reference = NULL) { names(means) <- groups per_group <- lapply(groups, function(g) { gv <- score[group == g] - list(n = length(gv), mean = mean(gv), median = stats::median(gv), - sd = if (length(gv) > 1L) stats::sd(gv) else NA_real_) + list( + n = length(gv), mean = mean(gv), median = stats::median(gv), + sd = if (length(gv) > 1L) stats::sd(gv) else NA_real_ + ) }) names(per_group) <- groups ow <- tryCatch( stats::oneway.test(score ~ factor(group), var.equal = TRUE), - error = function(e) NULL) + error = function(e) NULL + ) if (is.null(ow)) { - fstat <- NA_real_; pval <- NA_real_ + fstat <- NA_real_ + pval <- NA_real_ warnings <- c(warnings, "ANOVA could not be computed.") } else { - fstat <- unname(ow$statistic); pval <- ow$p.value + fstat <- unname(ow$statistic) + pval <- ow$p.value } - ref <- if (is.null(reference)) names(means)[which.min(means)] - else as.character(reference) + ref <- if (is.null(reference)) { + names(means)[which.min(means)] + } else { + as.character(reference) + } if (!ref %in% names(means)) { stop(sprintf("reference group '%s' not found", ref), call. = FALSE) } @@ -281,19 +327,31 @@ predpol_score_disparity <- function(score, group, reference = NULL) { significant <- isTRUE(is.finite(pval) && pval < 0.05) anova_line <- if (is.finite(pval)) { - sprintf(paste0("A one-way ANOVA finds the between-group difference %s ", - "(F = %.2f, p = %.4f). "), - if (significant) "statistically significant" - else "not significant", fstat, pval) - } else "" + sprintf( + paste0( + "A one-way ANOVA finds the between-group difference %s ", + "(F = %.2f, p = %.4f). " + ), + if (significant) { + "statistically significant" + } else { + "not significant" + }, fstat, pval + ) + } else { + "" + } interp <- paste0( - sprintf("Group mean risk scores span %.2f points (reference '%s'). ", - spread, ref), + sprintf( + "Group mean risk scores span %.2f points (reference '%s'). ", + spread, ref + ), anova_line, "Note: a score gap is not itself evidence of bias; pair this with ", - "predpol_calibration_audit(), which compares the score against ", - "realised outcomes.") + "morie_predpol_calibration_audit(), which compares the score against ", + "realised outcomes." + ) list( value = spread, diff --git a/r-package/morie/R/frns_temporal.R b/r-package/morie/R/frns_temporal.R index caff8cd79b..3a15761584 100644 --- a/r-package/morie/R/frns_temporal.R +++ b/r-package/morie/R/frns_temporal.R @@ -12,6 +12,15 @@ #' from one deployment cycle to the next and must be recomputed per #' period and per city. #' +#' @return The module's audit callable returns a named \code{list} with the +#' worst per-city Disparate Impact Ratio range, per-city and per-cell +#' breakdowns, and a plain-language \code{interpretation}. +#' @examples +#' period <- c(rep("p1", 10), rep("p2", 10)) +#' city <- rep("A", 20) +#' pred <- rep(c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0), 2) +#' grp <- rep(c(rep("X", 5), rep("Y", 5)), 2) +#' morie_predpol_temporal_audit(period, city, pred, grp, privileged = "X") #' @name frns_temporal NULL @@ -39,9 +48,9 @@ NULL #' city <- rep("A", 20) #' pred <- rep(c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0), 2) #' grp <- rep(c(rep("X", 5), rep("Y", 5)), 2) -#' res <- predpol_temporal_audit(period, city, pred, grp, privileged = "X") -#' res$per_city$A$dir_range # 0 — disparity is stable across periods -predpol_temporal_audit <- function(period, city, y_pred, group, +#' res <- morie_predpol_temporal_audit(period, city, pred, grp, privileged = "X") +#' res$per_city$A$dir_range # 0 — disparity is stable across periods +morie_predpol_temporal_audit <- function(period, city, y_pred, group, privileged = NULL, favorable = 1) { n <- length(period) if (!(n == length(city) && n == length(y_pred) && n == length(group))) { @@ -55,12 +64,17 @@ predpol_temporal_audit <- function(period, city, y_pred, group, warnings <- character(0) if (is.null(privileged)) { gs <- unique(group) - rates <- vapply(gs, function(g) mean(y_pred[group == g] == favorable), - numeric(1)) + rates <- vapply( + gs, function(g) mean(y_pred[group == g] == favorable), + numeric(1) + ) privileged <- gs[which.max(rates)] warnings <- c(warnings, sprintf( - paste0("`privileged` not given; inferred globally as '%s' so every ", - "cell uses the same reference group."), privileged)) + paste0( + "`privileged` not given; inferred globally as '%s' so every ", + "cell uses the same reference group." + ), privileged + )) } else { privileged <- as.character(privileged) } @@ -77,24 +91,34 @@ predpol_temporal_audit <- function(period, city, y_pred, group, skipped <- skipped + 1L next } - di <- fairness_disparate_impact( - cy, cg, privileged = privileged, favorable = favorable)$value - pg <- fairness_demographic_parity( - cy, cg, privileged = privileged, favorable = favorable)$value - rate_vec <- vapply(cgu, function(g) mean(cy[cg == g] == favorable), - numeric(1)) - gini <- fairness_gini(rate_vec)$value - bas <- fairness_bias_amplification( - cy, cg, privileged = privileged, favorable = favorable)$value + di <- morie_fairness_disparate_impact( + cy, cg, + privileged = privileged, favorable = favorable + )$value + pg <- morie_fairness_demographic_parity( + cy, cg, + privileged = privileged, favorable = favorable + )$value + rate_vec <- vapply( + cgu, function(g) mean(cy[cg == g] == favorable), + numeric(1) + ) + gini <- morie_fairness_gini(rate_vec)$value + bas <- morie_fairness_bias_amplification( + cy, cg, + privileged = privileged, favorable = favorable + )$value cells[[length(cells) + 1L]] <- list( city = cc, period = pp, n = sum(m), - dir = di, parity_gap = pg, gini = gini, bas = bas) + dir = di, parity_gap = pg, gini = gini, bas = bas + ) } } if (skipped > 0L) { warnings <- c(warnings, sprintf( "%d (city, period) cell(s) were skipped (fewer than two groups, or the privileged group absent).", - skipped)) + skipped + )) } if (length(cells) == 0L) { stop("no (city, period) cell had enough groups to audit", call. = FALSE) @@ -126,20 +150,28 @@ predpol_temporal_audit <- function(period, city, y_pred, group, mean_dirs_f <- mean_dirs[is.finite(mean_dirs)] cross <- if (length(mean_dirs_f) >= 2L) { max(mean_dirs_f) - min(mean_dirs_f) - } else 0 + } else { + 0 + } stab <- if (is.finite(worst_range) && worst_range >= 0.5) { - sprintf(paste0("Bias is temporally unstable: the Disparate Impact ", - "Ratio swings by up to %.3f across periods within a ", - "single city; the metric must be recomputed every ", - "period."), worst_range) + sprintf(paste0( + "Bias is temporally unstable: the Disparate Impact ", + "Ratio swings by up to %.3f across periods within a ", + "single city; the metric must be recomputed every ", + "period." + ), worst_range) } else { "The Disparate Impact Ratio is reasonably stable across periods." } div <- if (length(per_city) >= 2L && cross >= 0.3) { - sprintf(paste0(" Bias also diverges across cities: mean annual DIR ", - "spans %.3f between cities."), cross) - } else "" + sprintf(paste0( + " Bias also diverges across cities: mean annual DIR ", + "spans %.3f between cities." + ), cross) + } else { + "" + } list( value = worst_range, diff --git a/r-package/morie/R/fwpas.R b/r-package/morie/R/fwpas.R index a60685d13b..58b2020ef6 100644 --- a/r-package/morie/R/fwpas.R +++ b/r-package/morie/R/fwpas.R @@ -15,12 +15,9 @@ #' (= \code{a}), \code{activation}, \code{method}. #' @references Goodfellow, Bengio & Courville (2016), Deep Learning, Ch 6. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_fwpas_forward_pass_dense(x = rnorm(50), w = rnorm(3), b = rnorm(3)) #' @export -fwpas_forward_pass_dense <- function(x, w, b, activation = "sigmoid") { +morie_fwpas_forward_pass_dense <- function(x, w, b, activation = "sigmoid") { x <- as.matrix(x) w <- as.matrix(w) b <- as.numeric(b) @@ -35,22 +32,24 @@ fwpas_forward_pass_dense <- function(x, w, b, activation = "sigmoid") { z <- sweep(z, 2L, b, "+") a <- switch(activation, "identity" = z, - "linear" = z, - "none" = z, - "sigmoid" = 1 / (1 + exp(-z)), - "tanh" = tanh(z), - "relu" = pmax(0, z), - "softmax" = { + "linear" = z, + "none" = z, + "sigmoid" = 1 / (1 + exp(-z)), + "tanh" = tanh(z), + "relu" = pmax(z, 0), + "softmax" = { ez <- exp(z - apply(z, 1L, max)) sweep(ez, 1L, rowSums(ez), "/") }, stop(sprintf("Unknown activation: %s", activation)) ) - list(z = z, a = a, estimate = a, activation = activation, - method = "Dense layer forward pass") + list( + z = z, a = a, estimate = a, activation = activation, + method = "Dense layer forward pass" + ) } -#' @rdname fwpas_forward_pass_dense +#' @rdname morie_fwpas_forward_pass_dense #' @keywords internal #' @export -forward_pass_dense <- fwpas_forward_pass_dense +morie_forward_pass_dense <- morie_fwpas_forward_pass_dense diff --git a/r-package/morie/R/fzbrd.R b/r-package/morie/R/fzbrd.R index 0b822a3d92..c2a1204622 100644 --- a/r-package/morie/R/fzbrd.R +++ b/r-package/morie/R/fzbrd.R @@ -12,26 +12,30 @@ #' @return Named list with estimate, F_h, F_ch, se, h, c, t, n, method. #' @importFrom stats median pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzbrd(x = rnorm(50)) #' @export fzbrd <- function(x, t = NULL, h = NULL, c = 2) { - x <- as.numeric(x); n <- length(x) - if (n < 2L) return(list(estimate = NA_real_, n = n, - method = "fzbrd - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 2L) { + return(list( + estimate = NA_real_, n = n, + method = "fzbrd - too few obs" + )) + } if (is.null(t)) t <- stats::median(x) if (is.null(h)) h <- .morie_silverman_h(x) if (c <= 1) stop("c must be > 1") - F_h <- mean(stats::pnorm((t - x) / h)) + F_h <- mean(stats::pnorm((t - x) / h)) F_ch <- mean(stats::pnorm((t - x) / (c * h))) F_br <- (c^2 * F_h - F_ch) / (c^2 - 1) var_F <- F_h * (1 - F_h) / n var_inflate <- (c^4 + 1) / (c^2 - 1)^2 - list(estimate = F_br, F_h = F_h, F_ch = F_ch, - se = sqrt(var_F * var_inflate), h = h, c = c, t = t, n = n, - method = "Fauzi bias-reduced KDFE (Ch 2)") + list( + estimate = F_br, F_h = F_h, F_ch = F_ch, + se = sqrt(var_F * var_inflate), h = h, c = c, t = t, n = n, + method = "Fauzi bias-reduced KDFE (Ch 2)" + ) } # CANONICAL TEST @@ -41,4 +45,4 @@ fzbrd <- function(x, t = NULL, h = NULL, c = 2) { #' @rdname fzbrd #' @keywords internal #' @export -fauzi_bias_reduced_kdfe <- fzbrd +morie_fauzi_bias_reduced_kdfe <- fzbrd diff --git a/r-package/morie/R/fzcvm.R b/r-package/morie/R/fzcvm.R index b7a2e5eaf0..9ce3497c36 100644 --- a/r-package/morie/R/fzcvm.R +++ b/r-package/morie/R/fzcvm.R @@ -11,41 +11,58 @@ #' @return Named list: statistic, p_value, h, n, method. #' @importFrom stats sd pnorm qnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzcvm(x = rnorm(50)) #' @export fzcvm <- function(x, cdf = "norm", args = NULL, h = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 5L) return(list(statistic = NA_real_, p_value = NA_real_, n = n, - method = "fzcvm - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 5L) { + return(list( + statistic = NA_real_, p_value = NA_real_, n = n, + method = "fzcvm - too few obs" + )) + } if (is.null(h)) h <- .morie_silverman_h(x) if (is.function(cdf)) { t_grid <- seq(min(x), max(x), length.out = max(200L, n)) - F_ref <- vapply(t_grid, cdf, numeric(1)) + F_ref <- vapply(t_grid, cdf, numeric(1)) } else if (identical(cdf, "norm")) { if (is.null(args)) args <- list(mean(x), stats::sd(x)) u <- (seq_len(n) - 0.5) / n t_grid <- stats::qnorm(u, mean = args[[1]], sd = args[[2]]) - F_ref <- stats::pnorm(t_grid, mean = args[[1]], sd = args[[2]]) - } else stop("supply a function for non-normal cdf") - F_hat <- vapply(t_grid, function(g) mean(stats::pnorm((g - x) / h)), - numeric(1)) + F_ref <- stats::pnorm(t_grid, mean = args[[1]], sd = args[[2]]) + } else { + stop("supply a function for non-normal cdf") + } + F_hat <- vapply( + t_grid, function(g) mean(stats::pnorm((g - x) / h)), + numeric(1) + ) w2 <- n * mean((F_hat - F_ref)^2) p <- .morie_cvm_pvalue(w2 / n) - list(statistic = w2, p_value = p, h = h, n = n, - method = "Fauzi kernel-smoothed Cramer-von Mises (Ch 5)") + list( + statistic = w2, p_value = p, h = h, n = n, + method = "Fauzi kernel-smoothed Cramer-von Mises (Ch 5)" + ) } .morie_cvm_pvalue <- function(w2) { - if (w2 <= 0) return(1.0) - tbl <- list(c(0.347, 0.10), c(0.461, 0.05), c(0.581, 0.025), - c(0.743, 0.01), c(1.168, 0.001)) - if (w2 < tbl[[1]][1]) return(0.5) - if (w2 > tbl[[length(tbl)]][1]) return(tbl[[length(tbl)]][2] * 0.5) + if (w2 <= 0) { + return(1.0) + } + tbl <- list( + c(0.347, 0.10), c(0.461, 0.05), c(0.581, 0.025), + c(0.743, 0.01), c(1.168, 0.001) + ) + if (w2 < tbl[[1]][1]) { + return(0.5) + } + if (w2 > tbl[[length(tbl)]][1]) { + return(tbl[[length(tbl)]][2] * 0.5) + } for (i in seq_len(length(tbl) - 1)) { - a <- tbl[[i]]; b <- tbl[[i + 1]] + a <- tbl[[i]] + b <- tbl[[i + 1]] if (w2 >= a[1] && w2 <= b[1]) { lp <- log(a[2]) + (log(b[2]) - log(a[2])) * (w2 - a[1]) / (b[1] - a[1]) return(exp(lp)) @@ -61,4 +78,4 @@ fzcvm <- function(x, cdf = "norm", args = NULL, h = NULL) { #' @rdname fzcvm #' @keywords internal #' @export -fauzi_cvm_smoothed <- fzcvm +morie_fauzi_cvm_smoothed <- fzcvm diff --git a/r-package/morie/R/fzedg.R b/r-package/morie/R/fzedg.R index 8cf07b73f1..7a0f48f20b 100644 --- a/r-package/morie/R/fzedg.R +++ b/r-package/morie/R/fzedg.R @@ -14,26 +14,31 @@ #' cornish_fisher_correction, skew, p1z, z, p, n, method. #' @importFrom stats dnorm pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzedg(x = rnorm(50)) #' @export fzedg <- function(x, z = 1.96, p = 0.5) { - x <- as.numeric(x); n <- length(x) - if (n < 5L) return(list(estimate = NA_real_, n = n, - method = "fzedg - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 5L) { + return(list( + estimate = NA_real_, n = n, + method = "fzedg - too few obs" + )) + } skew <- (1 - 2 * p) / sqrt(p * (1 - p)) p1z <- -(skew / 6) * (z^2 - 1) - phi_z <- stats::dnorm(z); Phi_z <- stats::pnorm(z) + phi_z <- stats::dnorm(z) + Phi_z <- stats::pnorm(z) correction <- p1z * phi_z / sqrt(n) cf_correction <- (skew / 6) * (z^2 - 1) / sqrt(n) - list(estimate = Phi_z + correction, - normal_approx = Phi_z, - edgeworth_correction = correction, - cornish_fisher_correction = cf_correction, - skew = skew, p1z = p1z, z = z, p = p, n = n, - method = "Fauzi Edgeworth expansion for kernel quantile (Ch 3)") + list( + estimate = Phi_z + correction, + normal_approx = Phi_z, + edgeworth_correction = correction, + cornish_fisher_correction = cf_correction, + skew = skew, p1z = p1z, z = z, p = p, n = n, + method = "Fauzi Edgeworth expansion for kernel quantile (Ch 3)" + ) } # CANONICAL TEST @@ -42,4 +47,4 @@ fzedg <- function(x, z = 1.96, p = 0.5) { #' @rdname fzedg #' @keywords internal #' @export -fauzi_edgeworth_quantile <- fzedg +morie_fauzi_edgeworth_quantile <- fzedg diff --git a/r-package/morie/R/fzhdc.R b/r-package/morie/R/fzhdc.R index 534ebb7d16..7bc769b06b 100644 --- a/r-package/morie/R/fzhdc.R +++ b/r-package/morie/R/fzhdc.R @@ -15,49 +15,64 @@ #' @importFrom utils combn #' @importFrom stats var #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzhdc(x = rnorm(50)) #' @export fzhdc <- function(x, kernel = NULL, max_pairs = 2000L, seed = 0L) { - x <- as.numeric(x); n <- length(x) - if (n < 4L) return(list(estimate = NA_real_, n = n, - method = "fzhdc - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 4L) { + return(list( + estimate = NA_real_, n = n, + method = "fzhdc - too few obs" + )) + } if (is.null(kernel)) kernel <- function(a, b) 0.5 * (a - b)^2 total <- n * (n - 1) / 2 if (total <= max_pairs) { pairs <- utils::combn(n, 2) } else { set.seed(seed) - seen <- character(); pairs_list <- list() + seen <- character() + pairs_list <- list() while (length(pairs_list) < max_pairs) { ij <- sample.int(n, 2) - i <- min(ij); j <- max(ij) + i <- min(ij) + j <- max(ij) k <- paste(i, j, sep = "-") if (!(k %in% seen)) { - seen <- c(seen, k); pairs_list[[length(pairs_list) + 1]] <- c(i, j) + seen <- c(seen, k) + pairs_list[[length(pairs_list) + 1]] <- c(i, j) } } pairs <- do.call(cbind, pairs_list) } - g_vals <- vapply(seq_len(ncol(pairs)), - function(p) kernel(x[pairs[1, p]], x[pairs[2, p]]), - numeric(1)) + g_vals <- vapply( + seq_len(ncol(pairs)), + function(p) kernel(x[pairs[1, p]], x[pairs[2, p]]), + numeric(1) + ) theta <- mean(g_vals) sigma2 <- stats::var(g_vals) - g1 <- numeric(n); cnt <- numeric(n) + g1 <- numeric(n) + cnt <- numeric(n) for (p in seq_len(ncol(pairs))) { - i <- pairs[1, p]; j <- pairs[2, p]; v <- g_vals[p] - g1[i] <- g1[i] + v; cnt[i] <- cnt[i] + 1 - g1[j] <- g1[j] + v; cnt[j] <- cnt[j] + 1 + i <- pairs[1, p] + j <- pairs[2, p] + v <- g_vals[p] + g1[i] <- g1[i] + v + cnt[i] <- cnt[i] + 1 + g1[j] <- g1[j] + v + cnt[j] <- cnt[j] + 1 } - cnt[cnt == 0] <- 1; g1 <- g1 / cnt - theta + cnt[cnt == 0] <- 1 + g1 <- g1 / cnt - theta sigma1_sq <- stats::var(g1) var_U <- 4 * sigma1_sq / n - list(estimate = theta, sigma1_sq = sigma1_sq, sigma2_sq = sigma2, - se = sqrt(max(var_U, 0)), n = n, n_pairs = ncol(pairs), - method = "Fauzi H-decomposition of degree-2 U-statistic (Ch 5)") + list( + estimate = theta, sigma1_sq = sigma1_sq, sigma2_sq = sigma2, + se = sqrt(max(var_U, 0)), n = n, n_pairs = ncol(pairs), + method = "Fauzi H-decomposition of degree-2 U-statistic (Ch 5)" + ) } # CANONICAL TEST @@ -67,4 +82,4 @@ fzhdc <- function(x, kernel = NULL, max_pairs = 2000L, seed = 0L) { #' @rdname fzhdc #' @keywords internal #' @export -fauzi_h_decomposition <- fzhdc +morie_fauzi_h_decomposition <- fzhdc diff --git a/r-package/morie/R/fzhok.R b/r-package/morie/R/fzhok.R index e14e9066d3..946ccb5b93 100644 --- a/r-package/morie/R/fzhok.R +++ b/r-package/morie/R/fzhok.R @@ -13,24 +13,28 @@ #' @return Named list with estimate, h, t, order, mu_r, R_K, n, method. #' @importFrom stats median dnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzhok(x = rnorm(50)) #' @export fzhok <- function(x, t = NULL, h = NULL, order = 4L) { - x <- as.numeric(x); n <- length(x) - if (n < 2L) return(list(estimate = NA_real_, n = n, - method = "fzhok - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 2L) { + return(list( + estimate = NA_real_, n = n, + method = "fzhok - too few obs" + )) + } if (order != 4L) stop("only order=4 implemented") if (is.null(t)) t <- stats::median(x) if (is.null(h)) h <- .morie_silverman_h(x) u <- (t - x) / h k4 <- 0.5 * (3 - u^2) * stats::dnorm(u) f_hat <- mean(k4) / h - list(estimate = f_hat, h = h, t = t, order = order, - mu_r = -3, R_K = 27 / (32 * sqrt(pi)), n = n, - method = "Fauzi higher-order (4) kernel density (Ch 1)") + list( + estimate = f_hat, h = h, t = t, order = order, + mu_r = -3, R_K = 27 / (32 * sqrt(pi)), n = n, + method = "Fauzi higher-order (4) kernel density (Ch 1)" + ) } # CANONICAL TEST @@ -40,4 +44,4 @@ fzhok <- function(x, t = NULL, h = NULL, order = 4L) { #' @rdname fzhok #' @keywords internal #' @export -fauzi_higher_order_kernel <- fzhok +morie_fauzi_higher_order_kernel <- fzhok diff --git a/r-package/morie/R/fzkdf.R b/r-package/morie/R/fzkdf.R index 5ce587ed07..70205ffab0 100644 --- a/r-package/morie/R/fzkdf.R +++ b/r-package/morie/R/fzkdf.R @@ -15,28 +15,32 @@ #' @return Named list: estimate, bias, variance, se, h, t, n, method. #' @importFrom stats median sd quantile dnorm pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzkdf(x = rnorm(50)) #' @export fzkdf <- function(x, t = NULL, h = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 2L) return(list(estimate = NA_real_, n = n, - method = "fzkdf - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 2L) { + return(list( + estimate = NA_real_, n = n, + method = "fzkdf - too few obs" + )) + } if (is.null(t)) t <- stats::median(x) if (is.null(h)) h <- .morie_silverman_h(x) mu2 <- 1.0 - rK <- 1 / (2 * sqrt(pi)) + rK <- 1 / (2 * sqrt(pi)) z <- (t - x) / h F_hat <- mean(stats::pnorm(z)) f_hat <- mean(stats::dnorm(z) / h) fp_hat <- mean(-z * stats::dnorm(z) / (h * h)) bias <- (h^2 / 2) * mu2 * fp_hat var <- max(F_hat * (1 - F_hat) / n - 2 * h * rK * f_hat / n, 0) - list(estimate = F_hat, bias = bias, variance = var, - se = sqrt(var), h = h, t = t, n = n, - method = "Fauzi KDFE bias-variance (Ch 2)") + list( + estimate = F_hat, bias = bias, variance = var, + se = sqrt(var), h = h, t = t, n = n, + method = "Fauzi KDFE bias-variance (Ch 2)" + ) } # `.morie_silverman_h` moved to R/_helpers_fauzi.R so every fz*.R caller can @@ -49,4 +53,4 @@ fzkdf <- function(x, t = NULL, h = NULL) { #' @rdname fzkdf #' @keywords internal #' @export -fauzi_kdfe_properties <- fzkdf +morie_fauzi_kdfe_properties <- fzkdf diff --git a/r-package/morie/R/fzksm.R b/r-package/morie/R/fzksm.R index 4d3f55ffa5..f8c758caf0 100644 --- a/r-package/morie/R/fzksm.R +++ b/r-package/morie/R/fzksm.R @@ -13,15 +13,17 @@ #' @return Named list: statistic, p_value, h, n, method. #' @importFrom stats sd pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzksm(x = rnorm(50)) #' @export fzksm <- function(x, cdf = "norm", args = NULL, h = NULL, n_grid = 512L) { - x <- as.numeric(x); n <- length(x) - if (n < 5L) return(list(statistic = NA_real_, p_value = NA_real_, n = n, - method = "fzksm - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 5L) { + return(list( + statistic = NA_real_, p_value = NA_real_, n = n, + method = "fzksm - too few obs" + )) + } if (is.null(h)) h <- .morie_silverman_h(x) if (is.function(cdf)) { F0 <- cdf @@ -31,10 +33,13 @@ fzksm <- function(x, cdf = "norm", args = NULL, h = NULL, n_grid = 512L) { } else { stop("supply a function for non-normal cdf") } - lo <- min(x) - 6 * h; hi <- max(x) + 6 * h + lo <- min(x) - 6 * h + hi <- max(x) + 6 * h grid <- seq(lo, hi, length.out = n_grid) - F_hat <- vapply(grid, function(g) mean(stats::pnorm((g - x) / h)), - numeric(1)) + F_hat <- vapply( + grid, function(g) mean(stats::pnorm((g - x) / h)), + numeric(1) + ) F_ref <- vapply(grid, F0, numeric(1)) D_n <- max(abs(F_hat - F_ref)) # Kolmogorov asymptotic tail: P(K > x) = 2 * sum_{k>=1} (-1)^(k-1) exp(-2 k^2 x^2) @@ -42,8 +47,10 @@ fzksm <- function(x, cdf = "norm", args = NULL, h = NULL, n_grid = 512L) { k <- 1:100 pval <- 2 * sum((-1)^(k - 1) * exp(-2 * k^2 * lam^2)) pval <- max(0, min(1, pval)) - list(statistic = D_n, p_value = pval, h = h, n = n, - method = "Fauzi kernel-smoothed KS test (Ch 5)") + list( + statistic = D_n, p_value = pval, h = h, n = n, + method = "Fauzi kernel-smoothed KS test (Ch 5)" + ) } # CANONICAL TEST @@ -53,4 +60,4 @@ fzksm <- function(x, cdf = "norm", args = NULL, h = NULL, n_grid = 512L) { #' @rdname fzksm #' @keywords internal #' @export -fauzi_ks_smoothed <- fzksm +morie_fauzi_ks_smoothed <- fzksm diff --git a/r-package/morie/R/fzlst.R b/r-package/morie/R/fzlst.R index 0763d13e4d..d11d0751be 100644 --- a/r-package/morie/R/fzlst.R +++ b/r-package/morie/R/fzlst.R @@ -12,15 +12,17 @@ #' @return Named list with estimate, se, n, method. #' @importFrom stats sd quantile dnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzlst(x = rnorm(50)) #' @export fzlst <- function(x, score = NULL, n_quad = 200L) { - x <- as.numeric(x); n <- length(x) - if (n < 2L) return(list(estimate = NA_real_, n = n, - method = "fzlst - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 2L) { + return(list( + estimate = NA_real_, n = n, + method = "fzlst - too few obs" + )) + } if (is.null(score)) score <- function(u) rep(1, length(u)) x_sorted <- sort(x) fine_n <- n * 16L @@ -28,33 +30,39 @@ fzlst <- function(x, score = NULL, n_quad = 200L) { J_fine <- score(fine_u) edges <- seq(0, 1, length.out = n + 1L) cells <- findInterval(edges, fine_u, all.inside = TRUE) - cells[length(cells)] <- length(fine_u) - 1L # last edge + cells[length(cells)] <- length(fine_u) - 1L # last edge weights <- numeric(n) for (i in seq_len(n)) { - a <- cells[i]; b <- cells[i + 1L] + a <- cells[i] + b <- cells[i + 1L] if (b <= a) b <- a + 1L seg_u <- fine_u[a:(b + 1L)] seg_J <- J_fine[a:(b + 1L)] # trapezoidal rule weights[i] <- sum((seg_J[-length(seg_J)] + seg_J[-1L]) * - diff(seg_u)) / 2 + diff(seg_u)) / 2 } L <- sum(weights * x_sorted) uu <- (seq_len(n_quad) - 0.5) / n_quad - Q <- as.numeric(stats::quantile(x, uu, names = FALSE)) - sigma <- stats::sd(x); if (sigma <= 0) sigma <- 1 - h <- 1.06 * sigma * n^(-1/5) - f_Q <- vapply(Q, function(q) mean(stats::dnorm((q - x) / h) / h), - numeric(1)) + Q <- as.numeric(stats::quantile(x, uu, names = FALSE)) + sigma <- stats::sd(x) + if (sigma <= 0) sigma <- 1 + h <- 1.06 * sigma * n^(-1 / 5) + f_Q <- vapply( + Q, function(q) mean(stats::dnorm((q - x) / h) / h), + numeric(1) + ) J_at_u <- score(uu) U <- outer(uu, uu, pmin) Kmat <- (U - outer(uu, uu)) / (outer(f_Q, f_Q) + 1e-12) JJ <- outer(J_at_u, J_at_u) var <- mean(JJ * Kmat) / n var <- max(var, 0) - list(estimate = L, se = sqrt(var), n = n, - method = "Fauzi L-statistic with user score function J (Ch 5)") + list( + estimate = L, se = sqrt(var), n = n, + method = "Fauzi L-statistic with user score function J (Ch 5)" + ) } # CANONICAL TEST @@ -64,4 +72,4 @@ fzlst <- function(x, score = NULL, n_quad = 200L) { #' @rdname fzlst #' @keywords internal #' @export -fauzi_l_statistic <- fzlst +morie_fauzi_l_statistic <- fzlst diff --git a/r-package/morie/R/fzmis.R b/r-package/morie/R/fzmis.R index 1dab3f9462..e6390f9fd2 100644 --- a/r-package/morie/R/fzmis.R +++ b/r-package/morie/R/fzmis.R @@ -12,26 +12,32 @@ #' R_fpp, sigma, n, method. #' @importFrom stats sd #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzmis(x = rnorm(50)) #' @export fzmis <- function(x, h = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 5L) return(list(estimate = NA_real_, n = n, - method = "fzmis - too few obs")) - sigma <- stats::sd(x); if (sigma <= 0) sigma <- 1 - mu2 <- 1; R_K <- 1 / (2 * sqrt(pi)) + x <- as.numeric(x) + n <- length(x) + if (n < 5L) { + return(list( + estimate = NA_real_, n = n, + method = "fzmis - too few obs" + )) + } + sigma <- stats::sd(x) + if (sigma <= 0) sigma <- 1 + mu2 <- 1 + R_K <- 1 / (2 * sqrt(pi)) R_fpp <- 3 / (8 * sqrt(pi) * sigma^5) if (is.null(h)) h <- .morie_silverman_h(x) bias_part <- (h^4 / 4) * mu2^2 * R_fpp - var_part <- R_K / (n * h) + var_part <- R_K / (n * h) mise <- bias_part + var_part - h_opt <- (R_K / (n * mu2^2 * R_fpp))^(1/5) - list(estimate = mise, bias_part = bias_part, var_part = var_part, - h = h, h_opt = h_opt, R_fpp = R_fpp, sigma = sigma, n = n, - method = "Fauzi MISE decomposition (Ch 1)") + h_opt <- (R_K / (n * mu2^2 * R_fpp))^(1 / 5) + list( + estimate = mise, bias_part = bias_part, var_part = var_part, + h = h, h_opt = h_opt, R_fpp = R_fpp, sigma = sigma, n = n, + method = "Fauzi MISE decomposition (Ch 1)" + ) } # CANONICAL TEST @@ -41,4 +47,4 @@ fzmis <- function(x, h = NULL) { #' @rdname fzmis #' @keywords internal #' @export -fauzi_mise_computation <- fzmis +morie_fauzi_mise_computation <- fzmis diff --git a/r-package/morie/R/fzmrb.R b/r-package/morie/R/fzmrb.R index 6e02beeaae..6bdc14c80a 100644 --- a/r-package/morie/R/fzmrb.R +++ b/r-package/morie/R/fzmrb.R @@ -11,33 +11,47 @@ #' @return Named list: estimate, se, S_hat, t, h, n, method. #' @importFrom stats median pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export fzmrb <- function(x, t = NULL, h = NULL) { - x <- as.numeric(x); n <- length(x) + x <- as.numeric(x) + n <- length(x) if (any(x <= 0)) stop("fzmrb requires strictly positive x") - if (n < 2L) return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "fzmrb - too few obs")) + if (n < 2L) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "fzmrb - too few obs" + )) + } if (is.null(t)) t <- stats::median(x) if (t <= 0) stop("t must be positive") y <- log(x) if (is.null(h)) h <- .morie_silverman_h(y) s <- log(t) S_y <- mean(1 - stats::pnorm((s - y) / h)) - if (S_y <= 0) return(list(estimate = NA_real_, S_hat = S_y, n = n, t = t, - method = "fzmrb - S(t)=0")) - d <- x - t; above <- d > 0 - if (!any(above)) return(list(estimate = 0, S_hat = S_y, n = n, t = t, - method = "fzmrb - no x>t")) + if (S_y <= 0) { + return(list( + estimate = NA_real_, S_hat = S_y, n = n, t = t, + method = "fzmrb - S(t)=0" + )) + } + d <- x - t + above <- d > 0 + if (!any(above)) { + return(list( + estimate = 0, S_hat = S_y, n = n, t = t, + method = "fzmrb - no x>t" + )) + } m_hat <- mean(d[above]) second <- mean((d[above])^2) sigma2 <- max((second - m_hat^2) / S_y, 0) - list(estimate = m_hat, se = sqrt(sigma2 / n), - S_hat = S_y, t = t, h = h, n = n, - method = "Fauzi boundary-free MRL via log-bijection (Ch 4)") + list( + estimate = m_hat, se = sqrt(sigma2 / n), + S_hat = S_y, t = t, h = h, n = n, + method = "Fauzi boundary-free MRL via log-bijection (Ch 4)" + ) } # CANONICAL TEST @@ -47,4 +61,4 @@ fzmrb <- function(x, t = NULL, h = NULL) { #' @rdname fzmrb #' @keywords internal #' @export -fauzi_mrl_boundary_free <- fzmrb +morie_fauzi_mrl_boundary_free <- fzmrb diff --git a/r-package/morie/R/fzmrl.R b/r-package/morie/R/fzmrl.R index 0f698b23d8..229268730d 100644 --- a/r-package/morie/R/fzmrl.R +++ b/r-package/morie/R/fzmrl.R @@ -11,31 +11,44 @@ #' @return Named list with estimate, se, S_hat, t, h, n, method. #' @importFrom stats median pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzmrl(x = rnorm(50)) #' @export fzmrl <- function(x, t = NULL, h = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 2L) return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "fzmrl - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 2L) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "fzmrl - too few obs" + )) + } if (is.null(t)) t <- stats::median(x) if (is.null(h)) h <- .morie_silverman_h(x) S_t <- mean(1 - stats::pnorm((t - x) / h)) - if (S_t <= 0) return(list(estimate = NA_real_, se = NA_real_, - S_hat = S_t, n = n, t = t, - method = "fzmrl - S(t)=0")) - d <- x - t; above <- d > 0 - if (!any(above)) return(list(estimate = 0, se = NA_real_, - S_hat = S_t, n = n, t = t, - method = "fzmrl - no x>t")) + if (S_t <= 0) { + return(list( + estimate = NA_real_, se = NA_real_, + S_hat = S_t, n = n, t = t, + method = "fzmrl - S(t)=0" + )) + } + d <- x - t + above <- d > 0 + if (!any(above)) { + return(list( + estimate = 0, se = NA_real_, + S_hat = S_t, n = n, t = t, + method = "fzmrl - no x>t" + )) + } m_hat <- mean(d[above]) second <- mean((d[above])^2) sigma2 <- max((second - m_hat^2) / S_t, 0) - list(estimate = m_hat, se = sqrt(sigma2 / n), - S_hat = S_t, t = t, h = h, n = n, - method = "Fauzi kernel MRL asymptotic (Ch 4)") + list( + estimate = m_hat, se = sqrt(sigma2 / n), + S_hat = S_t, t = t, h = h, n = n, + method = "Fauzi kernel MRL asymptotic (Ch 4)" + ) } # CANONICAL TEST @@ -45,4 +58,4 @@ fzmrl <- function(x, t = NULL, h = NULL) { #' @rdname fzmrl #' @keywords internal #' @export -fauzi_mrl_asymptotic <- fzmrl +morie_fauzi_mrl_asymptotic <- fzmrl diff --git a/r-package/morie/R/fzqnt.R b/r-package/morie/R/fzqnt.R index 069982ffc0..6490866f90 100644 --- a/r-package/morie/R/fzqnt.R +++ b/r-package/morie/R/fzqnt.R @@ -10,28 +10,33 @@ #' @return Named list with estimate, se, p, h, density_at_Q, n, method. #' @importFrom stats sd quantile dnorm pnorm uniroot #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzqnt(x = rnorm(50)) #' @export fzqnt <- function(x, p = 0.5, h = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 5L) return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "fzqnt - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 5L) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "fzqnt - too few obs" + )) + } if (p <= 0 || p >= 1) stop("p must be in (0,1)") if (is.null(h)) h <- .morie_silverman_h(x) Fhat <- function(t) mean(stats::pnorm((t - x) / h)) - lo <- min(x) - 5 * h - 1e-9; hi <- max(x) + 5 * h + 1e-9 + lo <- min(x) - 5 * h - 1e-9 + hi <- max(x) + 5 * h + 1e-9 q_hat <- tryCatch( stats::uniroot(function(t) Fhat(t) - p, c(lo, hi))$root, error = function(e) as.numeric(stats::quantile(x, p, names = FALSE)) ) f_q <- mean(stats::dnorm((q_hat - x) / h) / h) se <- if (f_q > 0) sqrt(p * (1 - p) / n) / f_q else NA_real_ - list(estimate = q_hat, se = se, p = p, h = h, - density_at_Q = f_q, n = n, - method = "Fauzi kernel quantile asymptotic (Ch 3)") + list( + estimate = q_hat, se = se, p = p, h = h, + density_at_Q = f_q, n = n, + method = "Fauzi kernel quantile asymptotic (Ch 3)" + ) } # CANONICAL TEST @@ -41,4 +46,4 @@ fzqnt <- function(x, p = 0.5, h = NULL) { #' @rdname fzqnt #' @keywords internal #' @export -fauzi_kernel_quantile_asymptotic <- fzqnt +morie_fauzi_kernel_quantile_asymptotic <- fzqnt diff --git a/r-package/morie/R/fzsgn.R b/r-package/morie/R/fzsgn.R index c19b928ff7..b3e73206c0 100644 --- a/r-package/morie/R/fzsgn.R +++ b/r-package/morie/R/fzsgn.R @@ -12,26 +12,31 @@ #' @return Named list with statistic, z, p_value, theta0, h, n, method. #' @importFrom stats pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzsgn(x = rnorm(50)) #' @export fzsgn <- function(x, theta0 = 0, h = NULL, alternative = "two-sided") { - x <- as.numeric(x); n <- length(x) - if (n < 5L) return(list(statistic = NA_real_, p_value = NA_real_, n = n, - method = "fzsgn - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 5L) { + return(list( + statistic = NA_real_, p_value = NA_real_, n = n, + method = "fzsgn - too few obs" + )) + } if (is.null(h)) h <- .morie_silverman_h(x) S_n <- sum(stats::pnorm((x - theta0) / h)) z <- (S_n - n / 2) / sqrt(n / 4) p <- switch(alternative, - "two-sided" = 2 * (1 - stats::pnorm(abs(z))), - "greater" = 1 - stats::pnorm(z), - "less" = stats::pnorm(z), - stop("alternative must be two-sided/greater/less")) - list(statistic = S_n, z = z, p_value = p, - theta0 = theta0, h = h, n = n, - method = sprintf("Fauzi smoothed sign test (%s) (Ch 5)", alternative)) + "two-sided" = 2 * (1 - stats::pnorm(abs(z))), + "greater" = 1 - stats::pnorm(z), + "less" = stats::pnorm(z), + stop("alternative must be two-sided/greater/less") + ) + list( + statistic = S_n, z = z, p_value = p, + theta0 = theta0, h = h, n = n, + method = sprintf("Fauzi smoothed sign test (%s) (Ch 5)", alternative) + ) } # CANONICAL TEST @@ -40,4 +45,4 @@ fzsgn <- function(x, theta0 = 0, h = NULL, alternative = "two-sided") { #' @rdname fzsgn #' @keywords internal #' @export -fauzi_smoothed_sign <- fzsgn +morie_fauzi_smoothed_sign <- fzsgn diff --git a/r-package/morie/R/fzsrv.R b/r-package/morie/R/fzsrv.R index d5de6de552..630c1d0dbc 100644 --- a/r-package/morie/R/fzsrv.R +++ b/r-package/morie/R/fzsrv.R @@ -10,26 +10,30 @@ #' @return Named list with estimate, se, ci_lower, ci_upper, t, h, n, method. #' @importFrom stats median pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzsrv(x = rnorm(50)) #' @export fzsrv <- function(x, t = NULL, h = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 2L) return(list(estimate = NA_real_, n = n, - method = "fzsrv - too few obs")) + x <- as.numeric(x) + n <- length(x) + if (n < 2L) { + return(list( + estimate = NA_real_, n = n, + method = "fzsrv - too few obs" + )) + } if (is.null(t)) t <- stats::median(x) if (is.null(h)) h <- .morie_silverman_h(x) F_hat <- mean(stats::pnorm((t - x) / h)) S_hat <- 1 - F_hat se <- sqrt(S_hat * (1 - S_hat) / n) z <- 1.959963984540054 - list(estimate = S_hat, se = se, - ci_lower = max(0, S_hat - z * se), - ci_upper = min(1, S_hat + z * se), - t = t, h = h, n = n, - method = "Fauzi kernel survival S_hat(t)=1-F_hat_h(t) (Ch 4)") + list( + estimate = S_hat, se = se, + ci_lower = max(0, S_hat - z * se), + ci_upper = min(1, S_hat + z * se), + t = t, h = h, n = n, + method = "Fauzi kernel survival S_hat(t)=1-F_hat_h(t) (Ch 4)" + ) } # CANONICAL TEST @@ -39,4 +43,4 @@ fzsrv <- function(x, t = NULL, h = NULL) { #' @rdname fzsrv #' @keywords internal #' @export -fauzi_survival_kernel <- fzsrv +morie_fauzi_survival_kernel <- fzsrv diff --git a/r-package/morie/R/fzwlc.R b/r-package/morie/R/fzwlc.R index 3feab74a10..2754f1324b 100644 --- a/r-package/morie/R/fzwlc.R +++ b/r-package/morie/R/fzwlc.R @@ -12,20 +12,30 @@ #' @return Named list with statistic, z, p_value, theta0, h, n, method. #' @importFrom stats pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' fzwlc(x = rnorm(50)) #' @export fzwlc <- function(x, theta0 = 0, h = NULL, alternative = "two-sided") { - x <- as.numeric(x); n <- length(x) - if (n < 5L) return(list(statistic = NA_real_, p_value = NA_real_, n = n, - method = "fzwlc - too few obs")) - d <- x - theta0; ad <- abs(d); nz <- ad > 1e-12 - d <- d[nz]; ad <- ad[nz]; n_eff <- length(d) - if (n_eff < 5L) return(list(statistic = NA_real_, p_value = NA_real_, - n = n_eff, - method = "fzwlc - too few nonzero")) + x <- as.numeric(x) + n <- length(x) + if (n < 5L) { + return(list( + statistic = NA_real_, p_value = NA_real_, n = n, + method = "fzwlc - too few obs" + )) + } + d <- x - theta0 + ad <- abs(d) + nz <- ad > 1e-12 + d <- d[nz] + ad <- ad[nz] + n_eff <- length(d) + if (n_eff < 5L) { + return(list( + statistic = NA_real_, p_value = NA_real_, + n = n_eff, + method = "fzwlc - too few nonzero" + )) + } if (is.null(h)) h <- .morie_silverman_h(ad) D <- outer(ad, ad, function(a, b) (a - b) / h) R_smooth <- rowSums(stats::pnorm(D)) @@ -33,14 +43,19 @@ fzwlc <- function(x, theta0 = 0, h = NULL, alternative = "two-sided") { var <- n_eff * (n_eff + 1) * (2 * n_eff + 1) / 6 z <- W_n / sqrt(var) p <- switch(alternative, - "two-sided" = 2 * (1 - stats::pnorm(abs(z))), - "greater" = 1 - stats::pnorm(z), - "less" = stats::pnorm(z), - stop("alternative must be two-sided/greater/less")) - list(statistic = W_n, z = z, p_value = p, - theta0 = theta0, h = h, n = n_eff, - method = sprintf("Fauzi smoothed Wilcoxon signed-rank (%s) (Ch 5)", - alternative)) + "two-sided" = 2 * (1 - stats::pnorm(abs(z))), + "greater" = 1 - stats::pnorm(z), + "less" = stats::pnorm(z), + stop("alternative must be two-sided/greater/less") + ) + list( + statistic = W_n, z = z, p_value = p, + theta0 = theta0, h = h, n = n_eff, + method = sprintf( + "Fauzi smoothed Wilcoxon signed-rank (%s) (Ch 5)", + alternative + ) + ) } # CANONICAL TEST @@ -49,4 +64,4 @@ fzwlc <- function(x, theta0 = 0, h = NULL, alternative = "two-sided") { #' @rdname fzwlc #' @keywords internal #' @export -fauzi_smoothed_wilcoxon <- fzwlc +morie_fauzi_smoothed_wilcoxon <- fzwlc diff --git a/r-package/morie/R/ganls.R b/r-package/morie/R/ganls.R index 8e9a747756..5f7446c093 100644 --- a/r-package/morie/R/ganls.R +++ b/r-package/morie/R/ganls.R @@ -16,13 +16,11 @@ #' @return Named list \code{(d_loss, g_loss, v, estimate, kind, method)}. #' @references Goodfellow et al. (2014), NeurIPS. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ganls_gan_loss(D_real = rnorm(20), D_fake = rnorm(20)) #' @export -ganls_gan_loss <- function(D_real, D_fake, kind = "minimax") { - D_real <- as.numeric(D_real); D_fake <- as.numeric(D_fake) +morie_ganls_gan_loss <- function(D_real, D_fake, kind = "minimax") { + D_real <- as.numeric(D_real) + D_fake <- as.numeric(D_fake) clip_log <- function(p) log(pmin(pmax(p, 1e-12), 1.0)) v_real <- mean(clip_log(D_real)) v_fake_neg <- mean(clip_log(1 - D_fake)) @@ -33,12 +31,14 @@ ganls_gan_loss <- function(D_real, D_fake, kind = "minimax") { "nonsaturating" = -mean(clip_log(D_fake)), stop(sprintf("kind must be 'minimax' or 'nonsaturating', got %s", kind)) ) - list(d_loss = d_loss, g_loss = g_loss, v = V, - estimate = d_loss, kind = kind, - method = sprintf("GAN %s loss", kind)) + list( + d_loss = d_loss, g_loss = g_loss, v = V, + estimate = d_loss, kind = kind, + method = sprintf("GAN %s loss", kind) + ) } -#' @rdname ganls_gan_loss +#' @rdname morie_ganls_gan_loss #' @keywords internal #' @export -gan_loss <- ganls_gan_loss +morie_gan_loss <- morie_ganls_gan_loss diff --git a/r-package/morie/R/garch.R b/r-package/morie/R/garch.R index f3f81860fa..6359abd739 100644 --- a/r-package/morie/R/garch.R +++ b/r-package/morie/R/garch.R @@ -1,5 +1,21 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: GARCH(1,1) Gaussian negative log-likelihood for the base-R +# fallback. Extracted from the morie_garch_fit() optimiser closure so the +# parameter-domain guard is directly unit-testable. +.garch_negll <- function(p, r, n) { + omega <- p[1] + alpha <- p[2] + beta <- p[3] + if (omega <= 0 || alpha < 0 || beta < 0 || alpha + beta >= 1) { + return(1e10) + } + s2 <- numeric(n) + s2[1] <- var(r) + for (t in 2:n) s2[t] <- max(omega + alpha * r[t - 1]^2 + beta * s2[t - 1], 1e-12) + 0.5 * sum(log(2 * pi * s2) + r^2 / s2) +} + #' Fit a GARCH(1,1) model to a return series #' #' \deqn{\sigma_t^2 = \omega + \alpha \epsilon_{t-1}^2 + \beta \sigma_{t-1}^2.} @@ -8,14 +24,12 @@ #' @return Named list with \code{omega, alpha, beta, persistence, loglik, #' conditional_variance, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_garch_fit(x = rnorm(50)) #' @export -garch_fit <- function(x) { +morie_garch_fit <- function(x) { r <- as.numeric(x) - mean(as.numeric(x)) - n <- length(r); if (n < 10) stop("Need >=10 obs.") + n <- length(r) + if (n < 10) stop("Need >=10 obs.") if (requireNamespace("rugarch", quietly = TRUE)) { spec <- rugarch::ugarchspec( variance.model = list(model = "sGARCH", garchOrder = c(1, 1)), @@ -23,30 +37,32 @@ garch_fit <- function(x) { ) fit <- rugarch::ugarchfit(spec, r, solver = "hybrid") p <- rugarch::coef(fit) - return(list(omega = unname(p["omega"]), alpha = unname(p["alpha1"]), - beta = unname(p["beta1"]), - persistence = unname(p["alpha1"] + p["beta1"]), - loglik = as.numeric(rugarch::likelihood(fit)), - conditional_variance = as.numeric(rugarch::sigma(fit))^2, - n = n, - method = "GARCH(1,1) via rugarch")) - } - neg_ll <- function(p) { - omega <- p[1]; alpha <- p[2]; beta <- p[3] - if (omega <= 0 || alpha < 0 || beta < 0 || alpha + beta >= 1) return(1e10) - s2 <- numeric(n); s2[1] <- var(r) - for (t in 2:n) s2[t] <- max(omega + alpha * r[t - 1]^2 + beta * s2[t - 1], 1e-12) - 0.5 * sum(log(2 * pi * s2) + r^2 / s2) + return(list( + omega = unname(p["omega"]), alpha = unname(p["alpha1"]), + beta = unname(p["beta1"]), + persistence = unname(p["alpha1"] + p["beta1"]), + loglik = as.numeric(rugarch::likelihood(fit)), + conditional_variance = as.numeric(rugarch::sigma(fit))^2, + n = n, + method = "GARCH(1,1) via rugarch" + )) } + neg_ll <- function(p) .garch_negll(p, r, n) var_r <- var(r) opt <- nlminb(c(var_r * 0.05, 0.1, 0.85), neg_ll, - lower = c(1e-8, 1e-8, 1e-8), - upper = c(var_r * 10, 0.999, 0.999)) - omega <- opt$par[1]; alpha <- opt$par[2]; beta <- opt$par[3] - s2 <- numeric(n); s2[1] <- var_r + lower = c(1e-8, 1e-8, 1e-8), + upper = c(var_r * 10, 0.999, 0.999) + ) + omega <- opt$par[1] + alpha <- opt$par[2] + beta <- opt$par[3] + s2 <- numeric(n) + s2[1] <- var_r for (t in 2:n) s2[t] <- omega + alpha * r[t - 1]^2 + beta * s2[t - 1] - list(omega = omega, alpha = alpha, beta = beta, - persistence = alpha + beta, loglik = -opt$objective, - conditional_variance = s2, n = n, - method = "GARCH(1,1) Gaussian MLE (base R)") + list( + omega = omega, alpha = alpha, beta = beta, + persistence = alpha + beta, loglik = -opt$objective, + conditional_variance = s2, n = n, + method = "GARCH(1,1) Gaussian MLE (base R)" + ) } diff --git a/r-package/morie/R/gbens.R b/r-package/morie/R/gbens.R index 0e3e46b78a..f44460bb1b 100644 --- a/r-package/morie/R/gbens.R +++ b/r-package/morie/R/gbens.R @@ -20,21 +20,22 @@ #' n_estimators, learning_rate, max_depth, task, n, method. #' @importFrom stats predict #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_gradient_boosting_ensemble(x = rnorm(50), y = rnorm(50)) #' @export -gradient_boosting_ensemble <- function(x, y, n_estimators = 100L, - learning_rate = 0.1, - max_depth = 3L, - task = "auto", seed = 0L, - deterministic_seed = NULL) { +morie_gradient_boosting_ensemble <- function(x, y, n_estimators = 100L, + learning_rate = 0.1, + max_depth = 3L, + task = "auto", seed = 0L, + deterministic_seed = NULL) { + x <- .morie_ensure_design_matrix(x) if (is.null(dim(x))) x <- matrix(x, ncol = 1) x <- as.matrix(x) if (identical(task, "auto")) { - task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) - "classification" else "regression" + task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) { + "classification" + } else { + "regression" + } } n <- nrow(x) if (!is.null(deterministic_seed)) { @@ -43,12 +44,15 @@ gradient_boosting_ensemble <- function(x, y, n_estimators = 100L, set.seed(seed) } if (requireNamespace("gbm", quietly = TRUE)) { - df <- as.data.frame(x); df$.y <- if (task == "classification") as.numeric(as.factor(y)) - 1 else as.numeric(y) + df <- as.data.frame(x) + df$.y <- if (task == "classification") as.numeric(as.factor(y)) - 1 else as.numeric(y) distribution <- if (task == "classification") "bernoulli" else "gaussian" - fit <- gbm::gbm(.y ~ ., data = df, distribution = distribution, - n.trees = n_estimators, interaction.depth = max_depth, - shrinkage = learning_rate, bag.fraction = 1.0, - verbose = FALSE) + fit <- gbm::gbm(.y ~ ., + data = df, distribution = distribution, + n.trees = n_estimators, interaction.depth = max_depth, + shrinkage = learning_rate, bag.fraction = 1.0, + verbose = FALSE + ) p <- gbm::predict.gbm(fit, df, n.trees = n_estimators, type = "response") if (task == "classification") { preds <- as.integer(p > 0.5) @@ -57,19 +61,22 @@ gradient_boosting_ensemble <- function(x, y, n_estimators = 100L, train_score <- 1 - sum((p - df$.y)^2) / sum((df$.y - mean(df$.y))^2) } rel <- summary(fit, n.trees = n_estimators, plotit = FALSE) - fi <- rep(0, ncol(x)); names(fi) <- colnames(df)[seq_len(ncol(x))] + fi <- rep(0, ncol(x)) + names(fi) <- colnames(df)[seq_len(ncol(x))] fi[as.character(rel$var)] <- rel$rel.inf / 100 backend <- "gbm" } else { # xgboost fallback if (!requireNamespace("xgboost", quietly = TRUE)) { - stop("install either 'gbm' or 'xgboost' for gradient_boosting_ensemble") + stop("install either 'gbm' or 'xgboost' for morie_gradient_boosting_ensemble") } yv <- if (task == "classification") as.numeric(as.factor(y)) - 1 else as.numeric(y) obj <- if (task == "classification") "binary:logistic" else "reg:squarederror" - fit <- xgboost::xgboost(data = x, label = yv, nrounds = n_estimators, - eta = learning_rate, max_depth = max_depth, - objective = obj, verbose = 0L) + fit <- xgboost::xgboost( + data = x, label = yv, nrounds = n_estimators, + eta = learning_rate, max_depth = max_depth, + objective = obj, verbose = 0L + ) p <- predict(fit, x) if (task == "classification") { preds <- as.integer(p > 0.5) @@ -78,7 +85,8 @@ gradient_boosting_ensemble <- function(x, y, n_estimators = 100L, train_score <- 1 - sum((p - yv)^2) / sum((yv - mean(yv))^2) } imp <- xgboost::xgb.importance(model = fit) - fi <- rep(0, ncol(x)); names(fi) <- colnames(x) %||% paste0("V", seq_len(ncol(x))) + fi <- rep(0, ncol(x)) + names(fi) <- colnames(x) %||% paste0("V", seq_len(ncol(x))) if (nrow(imp) > 0) fi[imp$Feature] <- imp$Gain backend <- "xgboost" } diff --git a/r-package/morie/R/gbgen.R b/r-package/morie/R/gbgen.R index c852895825..902868ad34 100644 --- a/r-package/morie/R/gbgen.R +++ b/r-package/morie/R/gbgen.R @@ -14,25 +14,38 @@ #' @return list(estimate, y_hat, train_loss, se, n, method). #' @references Friedman (2001); Montesinos Lopez Ch 9. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_gradient_boosting_genomic( +#' x = rnorm(50), y = rnorm(50), +#' markers = matrix(sample(0:2, 200, TRUE), 50, 4) +#' ) #' @export -gradient_boosting_genomic <- function(x, y, markers, n_estimators = 100, - learning_rate = 0.1, max_depth = 3, - seed = 0) { +morie_gradient_boosting_genomic <- function(x, y, markers, n_estimators = 100, + learning_rate = 0.1, max_depth = 3, + seed = 0) { set.seed(seed) - y <- as.numeric(y); n <- length(y) + y <- as.numeric(y) + n <- length(y) M <- as.matrix(markers) - feats <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) M - else cbind(as.matrix(x), M) + feats <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) { + M + } else { + cbind(as.matrix(x), M) + } + # A zero-variance predictor carries no signal and makes gbm warn + # ("variable has no variation"); drop any constant columns first. + if (ncol(feats) > 1L) { + keep <- apply(feats, 2L, function(col) stats::var(col) > 0) + if (any(keep) && !all(keep)) feats <- feats[, keep, drop = FALSE] + } if (requireNamespace("gbm", quietly = TRUE)) { - df <- data.frame(y = y, feats); colnames(df)[-1] <- paste0("V", seq_len(ncol(feats))) - gb <- gbm::gbm(y ~ ., data = df, distribution = "gaussian", - n.trees = n_estimators, shrinkage = learning_rate, - interaction.depth = max_depth, n.minobsinnode = 2, - bag.fraction = 1, verbose = FALSE) + df <- data.frame(y = y, feats) + colnames(df)[-1] <- paste0("V", seq_len(ncol(feats))) + gb <- gbm::gbm(y ~ ., + data = df, distribution = "gaussian", + n.trees = n_estimators, shrinkage = learning_rate, + interaction.depth = max_depth, n.minobsinnode = 2, + bag.fraction = 1, verbose = FALSE + ) y_hat <- as.numeric(stats::predict(gb, df, n.trees = n_estimators)) train_loss <- as.numeric(gb$train.error) method_used <- "gbm::gbm" @@ -52,18 +65,23 @@ gradient_boosting_genomic <- function(x, y, markers, n_estimators = 100, for (thr in mids) { lf <- feats[, f] <= thr if (sum(lf) < 1 || sum(!lf) < 1) next - lv <- mean(r[lf]); rv <- mean(r[!lf]) + lv <- mean(r[lf]) + rv <- mean(r[!lf]) sse <- sum((r[lf] - lv)^2) + sum((r[!lf] - rv)^2) gain <- sum(r^2) - sse - if (is.null(best) || gain > best$gain) - best <- list(gain = gain, feature = f, threshold = thr, - left_val = lv, right_val = rv) + if (is.null(best) || gain > best$gain) { + best <- list( + gain = gain, feature = f, threshold = thr, + left_val = lv, right_val = rv + ) + } } } stumps[[b_]] <- best if (!is.null(best)) { pred <- ifelse(feats[, best$feature] <= best$threshold, - best$left_val, best$right_val) + best$left_val, best$right_val + ) F_pred <- F_pred + learning_rate * pred } train_loss[b_] <- mean((y - F_pred)^2) @@ -71,10 +89,12 @@ gradient_boosting_genomic <- function(x, y, markers, n_estimators = 100, y_hat <- F_pred } resid <- y - y_hat - list(estimate = mean(y_hat), y_hat = y_hat, train_loss = train_loss, - se = sqrt(mean(resid^2)), n = n, method = method_used) + list( + estimate = mean(y_hat), y_hat = y_hat, train_loss = train_loss, + se = sqrt(mean(resid^2)), n = n, method = method_used + ) } # CANONICAL TEST # set.seed(14); M <- matrix(rnorm(160), 40, 4); y <- sign(M[,1])+0.3*rnorm(40) -# gradient_boosting_genomic(rep(0,40), y, M, n_estimators=20, seed=14) +# morie_gradient_boosting_genomic(rep(0,40), y, M, n_estimators=20, seed=14) diff --git a/r-package/morie/R/gblpf.R b/r-package/morie/R/gblpf.R index ca0015ca9a..950b0103c6 100644 --- a/r-package/morie/R/gblpf.R +++ b/r-package/morie/R/gblpf.R @@ -11,15 +11,13 @@ #' @return Named list (estimate, g_hat, beta, se, lambda_gblup, n, method). #' @references Montesinos Lopez Ch 3. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_gblup_full(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) #' @export -gblup_full <- function(x, y, markers, lambda_gblup = NULL) { - y <- as.numeric(y); n <- length(y) +morie_gblup_full <- function(x, y, markers, lambda_gblup = NULL) { + y <- as.numeric(y) + n <- length(y) M <- as.matrix(markers) - G <- grm_vanraden(M, method = 1)$estimate + G <- morie_grm_vanraden(M, method = 1)$estimate G <- G + 1e-6 * diag(n) cand <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) { matrix(1, n, 1) @@ -30,14 +28,16 @@ gblup_full <- function(x, y, markers, lambda_gblup = NULL) { X <- cand[, qrx$pivot[seq_len(qrx$rank)], drop = FALSE] if (is.null(lambda_gblup)) { var_y <- if (n > 1) stats::var(y) else 1 - lam <- (0.5 * var_y) / (0.5 * var_y) # = 1 with h^2=0.5 + lam <- (0.5 * var_y) / (0.5 * var_y) # = 1 with h^2=0.5 } else { lam <- as.numeric(lambda_gblup) } Ginv <- solve(G) p <- ncol(X) - C <- rbind(cbind(crossprod(X), t(X)), - cbind(X, diag(n) + lam * Ginv)) + C <- rbind( + cbind(crossprod(X), t(X)), + cbind(X, diag(n) + lam * Ginv) + ) rhs <- c(crossprod(X, y), y) sol <- solve(C, rhs) beta <- sol[seq_len(p)] @@ -45,12 +45,14 @@ gblup_full <- function(x, y, markers, lambda_gblup = NULL) { y_hat <- X %*% beta + g_hat resid <- y - as.numeric(y_hat) se <- sqrt(sum(resid^2) / max(n - p, 1)) - list(estimate = mean(g_hat), g_hat = g_hat, beta = beta, - se = se, y_hat = as.numeric(y_hat), - lambda_gblup = lam, n = n, - method = "GBLUP with VanRaden G") + list( + estimate = mean(g_hat), g_hat = g_hat, beta = beta, + se = se, y_hat = as.numeric(y_hat), + lambda_gblup = lam, n = n, + method = "GBLUP with VanRaden G" + ) } # CANONICAL TEST # set.seed(0); M <- matrix(sample(0:2, 20, TRUE), 4, 5) -# gblup_full(rep(0, 4), c(1, 2, 3, 2.5), M) +# morie_gblup_full(rep(0, 4), c(1, 2, 3, 2.5), M) diff --git a/r-package/morie/R/gcvgn.R b/r-package/morie/R/gcvgn.R index ccf724f8e0..929a67862a 100644 --- a/r-package/morie/R/gcvgn.R +++ b/r-package/morie/R/gcvgn.R @@ -10,40 +10,53 @@ #' @return list(estimate, r_per_fold, y_hat, mse, mspe, slope, n, K, method). #' @references Montesinos Lopez Ch 2. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_genomic_cross_validation(x = rnorm(50), y = rnorm(50)) #' @export -genomic_cross_validation <- function(x, y, K = 5, lam = 1.0, seed = 0) { +morie_genomic_cross_validation <- function(x, y, K = 5, lam = 1.0, seed = 0) { set.seed(seed) - X <- as.matrix(x); y <- as.numeric(y); n <- nrow(X); p <- ncol(X) + X <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(X) + p <- ncol(X) idx <- sample.int(n) folds <- split(idx, cut(seq_along(idx), K, labels = FALSE)) - y_hat <- rep(0, n); r_per_fold <- numeric(K) + y_hat <- rep(0, n) + r_per_fold <- numeric(K) for (k in seq_len(K)) { test <- folds[[k]] train <- setdiff(seq_len(n), test) - Xtr <- X[train, , drop = FALSE]; ytr <- y[train] + Xtr <- X[train, , drop = FALSE] + ytr <- y[train] Xte <- X[test, , drop = FALSE] - mu <- mean(ytr); x_mu <- colMeans(Xtr) + mu <- mean(ytr) + x_mu <- colMeans(Xtr) Xtr_c <- sweep(Xtr, 2, x_mu) - beta <- solve(crossprod(Xtr_c) + lam * diag(p), - crossprod(Xtr_c, ytr - mu)) + beta <- solve( + crossprod(Xtr_c) + lam * diag(p), + crossprod(Xtr_c, ytr - mu) + ) y_hat[test] <- as.numeric(sweep(Xte, 2, x_mu) %*% beta) + mu - if (length(test) > 1 && stats::sd(y[test]) > 0 && stats::sd(y_hat[test]) > 0) + if (length(test) > 1 && stats::sd(y[test]) > 0 && stats::sd(y_hat[test]) > 0) { r_per_fold[k] <- stats::cor(y[test], y_hat[test]) - else r_per_fold[k] <- NA_real_ + } else { + r_per_fold[k] <- NA_real_ + } } r_pooled <- if (stats::sd(y_hat) > 0) stats::cor(y, y_hat) else NA_real_ - mse <- mean((y - y_hat)^2); mspe <- mse - slope <- if (stats::var(y_hat) > 0) - stats::cov(y_hat, y) / stats::var(y) else NA_real_ - list(estimate = r_pooled, r_per_fold = r_per_fold, - y_hat = y_hat, mse = mse, mspe = mspe, slope = slope, - n = n, K = K, method = "K-fold cross-validation (ridge)") + mse <- mean((y - y_hat)^2) + mspe <- mse + slope <- if (stats::var(y_hat) > 0) { + stats::cov(y_hat, y) / stats::var(y) + } else { + NA_real_ + } + list( + estimate = r_pooled, r_per_fold = r_per_fold, + y_hat = y_hat, mse = mse, mspe = mspe, slope = slope, + n = n, K = K, method = "K-fold cross-validation (ridge)" + ) } # CANONICAL TEST # set.seed(15); X <- matrix(rnorm(200), 50, 4); b <- c(1,-1,0.5,0) -# y <- X %*% b + 0.3*rnorm(50); genomic_cross_validation(X, y, K=5, seed=15) +# y <- X %*% b + 0.3*rnorm(50); morie_genomic_cross_validation(X, y, K=5, seed=15) diff --git a/r-package/morie/R/ghadp.R b/r-package/morie/R/ghadp.R index d27b78a5d5..49ba1abc76 100644 --- a/r-package/morie/R/ghadp.R +++ b/r-package/morie/R/ghadp.R @@ -7,17 +7,16 @@ #' @param d Integer dimension (default 1). #' @return Named list with estimate, betas, rates, best_beta, n, d, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_adaptation(x = rnorm(50)) #' @export -ghosal_adaptation <- function(x, betas = NULL, d = 1) { +morie_ghosal_adaptation <- function(x, betas = NULL, d = 1) { n <- length(x) if (is.null(betas)) betas <- seq(0.5, 3.0, length.out = 11) - rates <- n^(-betas / (2*betas + d)) + rates <- n^(-betas / (2 * betas + d)) best <- which.min(rates) - list(estimate = rates[best], betas = betas, rates = rates, - best_beta = betas[best], n = n, d = d, - method = "Adaptive posterior contraction over Holder grid") + list( + estimate = rates[best], betas = betas, rates = rates, + best_beta = betas[best], n = n, d = d, + method = "Adaptive posterior contraction over Holder grid" + ) } diff --git a/r-package/morie/R/ghbvm.R b/r-package/morie/R/ghbvm.R index 600784c7b4..abe16f17bf 100644 --- a/r-package/morie/R/ghbvm.R +++ b/r-package/morie/R/ghbvm.R @@ -13,37 +13,45 @@ #' @return Named list with estimate, se, theta_hat, z_ks_stat, z_ks_pvalue, #' wald, wald_pvalue, n, B, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_bernstein_von_mises(x = rnorm(50)) #' @export -ghosal_bernstein_von_mises <- function(x, theta0 = NULL, B = 500, seed = 0, - deterministic_seed = NULL) { +morie_ghosal_bernstein_von_mises <- function(x, theta0 = NULL, B = 500, seed = 0, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("ghbvm", deterministic_seed) } else { set.seed(seed) } - x <- as.numeric(x); n <- length(x) - if (n < 2) return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "BvM (n<2)")) - theta_hat <- mean(x); s <- sd(x) + x <- as.numeric(x) + n <- length(x) + if (n < 2) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "BvM (n<2)" + )) + } + theta_hat <- mean(x) + s <- sd(x) draws <- numeric(B) for (b in seq_len(B)) { - g <- stats::rgamma(n, 1, 1); u <- g / sum(g) + g <- stats::rgamma(n, 1, 1) + u <- g / sum(g) draws[b] <- sum(u * x) } z <- (draws - theta_hat) * sqrt(n) / max(s, 1e-12) ks <- suppressWarnings(stats::ks.test(z, "pnorm")) if (!is.null(theta0)) { - sd_d <- sd(draws); wald <- (mean(draws) - theta0) / max(sd_d, 1e-12) + sd_d <- sd(draws) + wald <- (mean(draws) - theta0) / max(sd_d, 1e-12) wald_p <- 2 * (1 - stats::pnorm(abs(wald))) } else { - wald <- NA_real_; wald_p <- NA_real_ + wald <- NA_real_ + wald_p <- NA_real_ } - list(estimate = mean(draws), se = sd(draws), theta_hat = theta_hat, - z_ks_stat = unname(ks$statistic), z_ks_pvalue = ks$p.value, - wald = wald, wald_pvalue = wald_p, n = n, B = B, - method = "BvM for mean functional (Bayesian bootstrap)") + list( + estimate = mean(draws), se = sd(draws), theta_hat = theta_hat, + z_ks_stat = unname(ks$statistic), z_ks_pvalue = ks$p.value, + wald = wald, wald_pvalue = wald_p, n = n, B = B, + method = "BvM for mean functional (Bayesian bootstrap)" + ) } diff --git a/r-package/morie/R/ghcls.R b/r-package/morie/R/ghcls.R index c5ec837cc8..4bae3ea6f8 100644 --- a/r-package/morie/R/ghcls.R +++ b/r-package/morie/R/ghcls.R @@ -10,28 +10,28 @@ #' @param seed Integer RNG seed (default 0). #' @return Named list with estimate, p_hat, accuracy, length_scale, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_np_classification(x = rnorm(50), y = rnorm(50)) #' @export -ghosal_np_classification <- function(x, y, length_scale = NULL, - sigma_f = 1.0, n_iter = 300, seed = 0) { +morie_ghosal_np_classification <- function(x, y, length_scale = NULL, + sigma_f = 1.0, n_iter = 300, seed = 0) { set.seed(seed) - x <- as.matrix(x); y <- as.numeric(y); n <- nrow(x) + x <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(x) y_pm <- 2 * y - 1 sq <- pmax(.gh_pairwise_sq(x), 0) if (is.null(length_scale)) { d <- sqrt(sq[upper.tri(sq)]) length_scale <- if (length(d)) max(stats::median(d[d > 0]), 1e-3) else 1 } - K <- sigma_f^2 * exp(-sq / (2*length_scale^2)) + 1e-6 * diag(n) + K <- sigma_f^2 * exp(-sq / (2 * length_scale^2)) + 1e-6 * diag(n) f <- rep(0, n) for (it in seq_len(n_iter)) { z <- y_pm * f - phi <- stats::dnorm(z); Phi <- pmin(pmax(stats::pnorm(z), 1e-12), 1-1e-12) + phi <- stats::dnorm(z) + Phi <- pmin(pmax(stats::pnorm(z), 1e-12), 1 - 1e-12) grad_ll <- y_pm * phi / Phi - W <- pmax((phi/Phi) * (phi/Phi + z), 1e-8) + W <- pmax((phi / Phi) * (phi / Phi + z), 1e-8) sW <- sqrt(W) # B = I + diag(sW) %*% K %*% diag(sW) B <- diag(n) + (sW %o% sW) * K @@ -40,12 +40,18 @@ ghosal_np_classification <- function(x, y, length_scale = NULL, b <- W * f + grad_ll a <- b - sW * backsolve(Lf, forwardsolve(t(Lf), sW * (K %*% b))) f_new <- as.numeric(K %*% a) - if (max(abs(f_new - f)) < 1e-6) { f <- f_new; break } + if (max(abs(f_new - f)) < 1e-6) { + f <- f_new + break + } f <- f_new } - p_hat <- stats::pnorm(f); pred <- as.integer(p_hat >= 0.5) + p_hat <- stats::pnorm(f) + pred <- as.integer(p_hat >= 0.5) accuracy <- mean(pred == y) - list(estimate = mean(p_hat), p_hat = p_hat, accuracy = accuracy, - length_scale = length_scale, n = n, - method = "Probit-link GP classifier (Laplace)") + list( + estimate = mean(p_hat), p_hat = p_hat, accuracy = accuracy, + length_scale = length_scale, n = n, + method = "Probit-link GP classifier (Laplace)" + ) } diff --git a/r-package/morie/R/ghcon.R b/r-package/morie/R/ghcon.R index 76b49ab576..53fe8efa72 100644 --- a/r-package/morie/R/ghcon.R +++ b/r-package/morie/R/ghcon.R @@ -10,21 +10,23 @@ #' @param seed Integer RNG seed (default 0). #' @return Named list with estimate, ks_mean, ks_se, schwartz_bound, n, eps, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_posterior_consistency(x = rnorm(50)) #' @export -ghosal_posterior_consistency <- function(x, ref_loc = NULL, ref_scale = NULL, - eps = 0.1, K = 200, seed = 0) { +morie_ghosal_posterior_consistency <- function(x, ref_loc = NULL, ref_scale = NULL, + eps = 0.1, K = 200, seed = 0) { set.seed(seed) - x <- as.numeric(x); n <- length(x) - if (n == 0) return(list(estimate = NA_real_, n = 0, - method = "Schwartz consistency (empty)")) + x <- as.numeric(x) + n <- length(x) + if (n == 0) { + return(list( + estimate = NA_real_, n = 0, + method = "Schwartz consistency (empty)" + )) + } xs <- sort(x) grid <- seq(xs[1] - 1, xs[n] + 1, length.out = 200) if (is.null(ref_loc) || is.null(ref_scale)) { - F_ref <- sapply(grid, function(t) sum(xs <= t)) / n + F_ref <- vapply(grid, function(t) sum(xs <= t), numeric(1)) / n } else { F_ref <- stats::pnorm(grid, ref_loc, ref_scale) } @@ -33,16 +35,19 @@ ghosal_posterior_consistency <- function(x, ref_loc = NULL, ref_scale = NULL, if (.gh_have("MCMCpack")) { u <- as.numeric(MCMCpack::rdirichlet(1, rep(1, n))) } else { - g <- stats::rgamma(n, shape = 1, rate = 1); u <- g / sum(g) + g <- stats::rgamma(n, shape = 1, rate = 1) + u <- g / sum(g) } - cdf <- cumsum(u) # already in sorted order since xs is sorted + cdf <- cumsum(u) # already in sorted order since xs is sorted idx <- findInterval(grid, xs) F_draw <- ifelse(idx == 0, 0, cdf[pmin(pmax(idx, 1L), n)]) ks[k] <- max(abs(F_draw - F_ref)) } - list(estimate = mean(ks > eps), ks_mean = mean(ks), - ks_se = sd(ks) / sqrt(K), - schwartz_bound = exp(-2 * n * eps^2), - n = n, eps = eps, - method = "Schwartz consistency (Bayesian-bootstrap proxy)") + list( + estimate = mean(ks > eps), ks_mean = mean(ks), + ks_se = sd(ks) / sqrt(K), + schwartz_bound = exp(-2 * n * eps^2), + n = n, eps = eps, + method = "Schwartz consistency (Bayesian-bootstrap proxy)" + ) } diff --git a/r-package/morie/R/ghcrt.R b/r-package/morie/R/ghcrt.R index c79ee1881d..0726142518 100644 --- a/r-package/morie/R/ghcrt.R +++ b/r-package/morie/R/ghcrt.R @@ -9,19 +9,22 @@ #' @param d Integer dimension (default 1). #' @return Named list with estimate, log_rate_correction, parametric_rate, n, beta, d, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_contraction_rate(x = rnorm(50)) #' @export -ghosal_contraction_rate <- function(x, beta = 1.0, d = 1) { +morie_ghosal_contraction_rate <- function(x, beta = 1.0, d = 1) { n <- length(x) - if (n <= 1) return(list(estimate = NA_real_, n = n, - method = "Contraction rate (n too small)")) + if (n <= 1) { + return(list( + estimate = NA_real_, n = n, + method = "Contraction rate (n too small)" + )) + } eps_n <- n^(-beta / (2 * beta + d)) - list(estimate = eps_n, - log_rate_correction = (log(n))^(beta/(2*beta+d)) * eps_n, - parametric_rate = n^(-0.5), - n = n, beta = beta, d = d, - method = "Minimax contraction rate n^{-beta/(2beta+d)}") + list( + estimate = eps_n, + log_rate_correction = (log(n))^(beta / (2 * beta + d)) * eps_n, + parametric_rate = n^(-0.5), + n = n, beta = beta, d = d, + method = "Minimax contraction rate n^{-beta/(2beta+d)}" + ) } diff --git a/r-package/morie/R/ghdir.R b/r-package/morie/R/ghdir.R index 2797906af3..3dda02fdbe 100644 --- a/r-package/morie/R/ghdir.R +++ b/r-package/morie/R/ghdir.R @@ -12,31 +12,32 @@ #' @return named list with `estimate`, `cdf_grid`, `cdf_post`, #' `cdf_var`, `alpha_post`, `n`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_dirichlet_posterior(x = rnorm(50)) #' @export -ghosal_dirichlet_posterior <- function(x, alpha = 1.0, base_mean = 0, - base_sd = 1, grid = NULL) { - x <- as.numeric(x); n <- length(x) +morie_ghosal_dirichlet_posterior <- function(x, alpha = 1.0, base_mean = 0, + base_sd = 1, grid = NULL) { + x <- as.numeric(x) + n <- length(x) if (is.null(grid)) { - if (n == 0) grid <- seq(base_mean - 3*base_sd, base_mean + 3*base_sd, length.out = 51) - else { + if (n == 0) { + grid <- seq(base_mean - 3 * base_sd, base_mean + 3 * base_sd, length.out = 51) + } else { pad <- max(1e-6, 0.1 * (max(x) - min(x) + 1)) grid <- seq(min(x) - pad, max(x) + pad, length.out = 51) } } alpha_post <- alpha + n G0_t <- stats::pnorm(grid, mean = base_mean, sd = base_sd) - emp_t <- if (n > 0) sapply(grid, function(t) sum(x <= t)) else rep(0, length(grid)) + emp_t <- if (n > 0) vapply(grid, function(t) sum(x <= t), numeric(1)) else rep(0, length(grid)) F_post <- (alpha * G0_t + emp_t) / alpha_post var_post <- F_post * (1 - F_post) / (alpha_post + 1) t0 <- if (n > 0) mean(x) else base_mean G0_t0 <- stats::pnorm(t0, mean = base_mean, sd = base_sd) emp_t0 <- if (n > 0) sum(x <= t0) else 0 est <- (alpha * G0_t0 + emp_t0) / alpha_post - list(estimate = est, alpha_post = alpha_post, n = n, - cdf_grid = grid, cdf_post = F_post, cdf_var = var_post, - method = "Dirichlet process posterior (conjugate)") + list( + estimate = est, alpha_post = alpha_post, n = n, + cdf_grid = grid, cdf_post = F_post, cdf_var = var_post, + method = "Dirichlet process posterior (conjugate)" + ) } diff --git a/r-package/morie/R/ghdpm.R b/r-package/morie/R/ghdpm.R index 608a52a908..294dc7556b 100644 --- a/r-package/morie/R/ghdpm.R +++ b/r-package/morie/R/ghdpm.R @@ -12,36 +12,41 @@ #' @return named list with `estimate`, `grid`, `density`, `k_post`, `n` #' @importFrom utils head tail #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_dpmixture_density(x = rnorm(50)) #' @export -ghosal_dpmixture_density <- function(x, alpha = 1.0, sigma = NULL, - grid = NULL, n_iter = 120, burn = 40, - seed = 0, deterministic_seed = NULL) { +morie_ghosal_dpmixture_density <- function(x, alpha = 1.0, sigma = NULL, + grid = NULL, n_iter = 120, burn = 40, + seed = 0, deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("ghdpm", deterministic_seed) } else { set.seed(seed) } - x <- as.numeric(x); n <- length(x) - if (n == 0) return(list(estimate = NA_real_, n = 0, - method = "DP-mixture density (empty input)")) + x <- as.numeric(x) + n <- length(x) + if (n == 0) { + return(list( + estimate = NA_real_, n = 0, + method = "DP-mixture density (empty input)" + )) + } if (is.null(sigma)) { s <- if (n > 1) sd(x) else 1 - sigma <- 1.06 * max(s, 1e-6) * n^(-1/5) + sigma <- 1.06 * max(s, 1e-6) * n^(-1 / 5) } sigma <- max(sigma, 1e-6) - m0 <- mean(x); s0 <- if (n > 1) sd(x) else 1 + m0 <- mean(x) + s0 <- if (n > 1) sd(x) else 1 s0 <- max(s0, 1e-3) if (is.null(grid)) grid <- seq(min(x) - s0, max(x) + s0, length.out = 51) labels <- rep(0L, n) - k_chain <- integer(0); f_chain <- list() + k_chain <- integer(0) + f_chain <- list() new_log <- function(xi) stats::dnorm(xi, mean = m0, sd = sqrt(sigma^2 + s0^2), log = TRUE) cluster_post <- function(xs) { - nk <- length(xs); v <- 1 / (1/s0^2 + nk/sigma^2) - m <- v * (m0/s0^2 + sum(xs)/sigma^2) + nk <- length(xs) + v <- 1 / (1 / s0^2 + nk / sigma^2) + m <- v * (m0 / s0^2 + sum(xs) / sigma^2) list(m = m, v = v) } in_log <- function(xi, xs) { @@ -50,7 +55,8 @@ ghosal_dpmixture_density <- function(x, alpha = 1.0, sigma = NULL, } for (it in seq_len(n_iter)) { for (i in seq_len(n)) { - old <- labels[i]; labels[i] <- -1L + old <- labels[i] + labels[i] <- -1L uniq <- sort(unique(labels[labels >= 0])) lps <- numeric(length(uniq) + 1L) for (j in seq_along(uniq)) { @@ -58,7 +64,9 @@ ghosal_dpmixture_density <- function(x, alpha = 1.0, sigma = NULL, lps[j] <- log(length(xs)) + in_log(x[i], xs) } lps[length(uniq) + 1L] <- log(alpha) + new_log(x[i]) - lps <- lps - max(lps); probs <- exp(lps); probs <- probs/sum(probs) + lps <- lps - max(lps) + probs <- exp(lps) + probs <- probs / sum(probs) choice <- sample.int(length(probs), 1, prob = probs) if (choice == length(uniq) + 1L) { labels[i] <- if (length(uniq)) max(uniq) + 1L else 0L @@ -70,7 +78,8 @@ ghosal_dpmixture_density <- function(x, alpha = 1.0, sigma = NULL, uniq <- sort(unique(labels)) f <- numeric(length(grid)) for (k in uniq) { - xs <- x[labels == k]; cp <- cluster_post(xs) + xs <- x[labels == k] + cp <- cluster_post(xs) f <- f + (length(xs) / (alpha + n)) * stats::dnorm(grid, mean = cp$m, sd = sqrt(cp$v + sigma^2)) } @@ -84,7 +93,9 @@ ghosal_dpmixture_density <- function(x, alpha = 1.0, sigma = NULL, num <- sum(diff(grid) * (head(density * grid, -1) + tail(density * grid, -1))) / 2 den <- sum(diff(grid) * (head(density, -1) + tail(density, -1))) / 2 est <- num / max(den, 1e-12) - list(estimate = est, grid = grid, density = density, - k_post = mean(k_chain), n = n, alpha = alpha, sigma = sigma, - method = "DP-mixture density via collapsed Gibbs (Neal 2000 Alg 3)") + list( + estimate = est, grid = grid, density = density, + k_post = mean(k_chain), n = n, alpha = alpha, sigma = sigma, + method = "DP-mixture density via collapsed Gibbs (Neal 2000 Alg 3)" + ) } diff --git a/r-package/morie/R/ghebp.R b/r-package/morie/R/ghebp.R index e81c0b1caf..509d5954c5 100644 --- a/r-package/morie/R/ghebp.R +++ b/r-package/morie/R/ghebp.R @@ -1,30 +1,44 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: Antoniak DP empirical-Bayes negative log-likelihood in the +# concentration parameter alpha. Extracted from the +# morie_ghosal_empirical_bayes() optimiser closure for direct unit-testing. +.ghebp_negll <- function(a, K_n, n) { + -(K_n * log(a) + lgamma(a) - lgamma(a + n)) +} + #' Empirical-Bayes alpha MLE for a DP, given the observed K_n. #' #' @param x Numeric data vector. #' @param alpha_grid Optional numeric grid of alpha values to maximise over. #' @return Named list with estimate (alpha-hat), K_n, log_lik_at_estimate, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_empirical_bayes(x = rnorm(50)) #' @export -ghosal_empirical_bayes <- function(x, alpha_grid = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 2) return(list(estimate = NA_real_, n = n, - method = "Empirical Bayes (n<2)")) +morie_ghosal_empirical_bayes <- function(x, alpha_grid = NULL) { + x <- as.numeric(x) + n <- length(x) + if (n < 2) { + return(list( + estimate = NA_real_, n = n, + method = "Empirical Bayes (n<2)" + )) + } K_n <- length(unique(x)) if (K_n == n) K_n <- max(2, ceiling(log2(n) + 1)) - neg_ll <- function(a) -(K_n * log(a) + lgamma(a) - lgamma(a + n)) + neg_ll <- function(a) .ghebp_negll(a, K_n, n) if (is.null(alpha_grid)) { opt <- stats::optimize(neg_ll, interval = c(1e-3, 1e3)) - a_hat <- opt$minimum; ll <- -opt$objective + a_hat <- opt$minimum + ll <- -opt$objective } else { - ll_grid <- -sapply(alpha_grid, neg_ll) - idx <- which.max(ll_grid); a_hat <- alpha_grid[idx]; ll <- ll_grid[idx] + ll_grid <- -vapply(alpha_grid, neg_ll, numeric(1)) + idx <- which.max(ll_grid) + a_hat <- alpha_grid[idx] + ll <- ll_grid[idx] } - list(estimate = a_hat, K_n = K_n, log_lik_at_estimate = ll, n = n, - method = "Empirical-Bayes alpha for DP (Antoniak 1974 MLE)") + list( + estimate = a_hat, K_n = K_n, log_lik_at_estimate = ll, n = n, + method = "Empirical-Bayes alpha for DP (Antoniak 1974 MLE)" + ) } diff --git a/r-package/morie/R/ghgpm.R b/r-package/morie/R/ghgpm.R index 580bca0934..5db2f1ff46 100644 --- a/r-package/morie/R/ghgpm.R +++ b/r-package/morie/R/ghgpm.R @@ -11,15 +11,13 @@ #' @param x_star Optional matrix of prediction points (defaults to x). #' @return Named list with estimate, se, mu, sd, length_scale, nu, noise, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_gp_matern(x = rnorm(50), y = rnorm(50)) #' @export -ghosal_gp_matern <- function(x, y, nu = 1.5, length_scale = NULL, - sigma_f = 1.0, noise = NULL, x_star = NULL) { +morie_ghosal_gp_matern <- function(x, y, nu = 1.5, length_scale = NULL, + sigma_f = 1.0, noise = NULL, x_star = NULL) { if (is.vector(x)) x <- matrix(as.numeric(x), ncol = 1L) else x <- as.matrix(x) - y <- as.numeric(y); n <- nrow(x) + y <- as.numeric(y) + n <- nrow(x) if (is.null(x_star)) { x_star <- x } else if (is.vector(x_star)) { @@ -36,16 +34,19 @@ ghosal_gp_matern <- function(x, y, nu = 1.5, length_scale = NULL, kernel <- function(a, b) { sq_ab <- pmax(.gh_pairwise_sq(a, b), 0) r <- sqrt(sq_ab) - if (isTRUE(all.equal(nu, 0.5))) return(sigma_f^2 * exp(-r / length_scale)) + if (isTRUE(all.equal(nu, 0.5))) { + return(sigma_f^2 * exp(-r / length_scale)) + } if (isTRUE(all.equal(nu, 1.5))) { t <- sqrt(3) * r / length_scale return(sigma_f^2 * (1 + t) * exp(-t)) } if (isTRUE(all.equal(nu, 2.5))) { t <- sqrt(5) * r / length_scale - return(sigma_f^2 * (1 + t + t^2/3) * exp(-t)) + return(sigma_f^2 * (1 + t + t^2 / 3) * exp(-t)) } - rr <- pmax(r, 1e-12); z <- sqrt(2*nu) * rr / length_scale + rr <- pmax(r, 1e-12) + z <- sqrt(2 * nu) * rr / length_scale coef <- sigma_f^2 * 2^(1 - nu) / gamma(nu) K <- coef * z^nu * besselK(z, nu) K[r < 1e-12] <- sigma_f^2 @@ -60,7 +61,9 @@ ghosal_gp_matern <- function(x, y, nu = 1.5, length_scale = NULL, v <- forwardsolve(t(L), t(K_s)) var <- K_ss_diag - colSums(v^2) sd_ <- sqrt(pmax(var, 0)) - list(estimate = mean(mu), se = mean(sd_), mu = mu, sd = sd_, - length_scale = length_scale, nu = nu, noise = noise, n = n, - method = "GP regression (Matern kernel)") + list( + estimate = mean(mu), se = mean(sd_), mu = mu, sd = sd_, + length_scale = length_scale, nu = nu, noise = noise, n = n, + method = "GP regression (Matern kernel)" + ) } diff --git a/r-package/morie/R/ghgps.R b/r-package/morie/R/ghgps.R index 8a32007984..a2de4d6115 100644 --- a/r-package/morie/R/ghgps.R +++ b/r-package/morie/R/ghgps.R @@ -10,16 +10,14 @@ #' @param x_star Optional matrix of prediction points (defaults to x). #' @return Named list with estimate, se, mu, sd, length_scale, noise, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_gp_squared_exponential(x = rnorm(50), y = rnorm(50)) #' @export -ghosal_gp_squared_exponential <- function(x, y, length_scale = NULL, - sigma_f = 1.0, noise = NULL, - x_star = NULL) { +morie_ghosal_gp_squared_exponential <- function(x, y, length_scale = NULL, + sigma_f = 1.0, noise = NULL, + x_star = NULL) { if (is.vector(x)) x <- matrix(as.numeric(x), ncol = 1L) else x <- as.matrix(x) - y <- as.numeric(y); n <- nrow(x) + y <- as.numeric(y) + n <- nrow(x) if (is.null(x_star)) { x_star <- x } else if (is.vector(x_star)) { @@ -46,7 +44,9 @@ ghosal_gp_squared_exponential <- function(x, y, length_scale = NULL, v <- forwardsolve(t(L), t(K_s)) var <- K_ss_diag - colSums(v^2) sd_ <- sqrt(pmax(var, 0)) - list(estimate = mean(mu), se = mean(sd_), mu = mu, sd = sd_, - length_scale = length_scale, noise = noise, n = n, - method = "GP regression (squared-exponential kernel)") + list( + estimate = mean(mu), se = mean(sd_), mu = mu, sd = sd_, + length_scale = length_scale, noise = noise, n = n, + method = "GP regression (squared-exponential kernel)" + ) } diff --git a/r-package/morie/R/ghhbp.R b/r-package/morie/R/ghhbp.R index 5647f6a60b..96104bc94f 100644 --- a/r-package/morie/R/ghhbp.R +++ b/r-package/morie/R/ghhbp.R @@ -14,29 +14,34 @@ #' @return Named list with estimate (alpha post mean), alpha_se, #' alpha_draws, K_n, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_hierarchical_bayes(x = rnorm(50)) #' @export -ghosal_hierarchical_bayes <- function(x, a_prior = 1.0, b_prior = 1.0, - M = 400, seed = 0, - deterministic_seed = NULL) { +morie_ghosal_hierarchical_bayes <- function(x, a_prior = 1.0, b_prior = 1.0, + M = 400, seed = 0, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("ghhbp", deterministic_seed) } else { set.seed(seed) } - x <- as.numeric(x); n <- length(x) - if (n < 2) return(list(estimate = NA_real_, n = n, - method = "Hierarchical NP-Bayes (n<2)")) + x <- as.numeric(x) + n <- length(x) + if (n < 2) { + return(list( + estimate = NA_real_, n = n, + method = "Hierarchical NP-Bayes (n<2)" + )) + } K_n <- length(unique(x)) if (K_n == n) K_n <- max(2, ceiling(log2(n) + 1)) - a <- a_prior; b <- b_prior; alpha <- 1 + a <- a_prior + b <- b_prior + alpha <- 1 draws <- numeric(M) for (m in seq_len(M)) { eta <- stats::rbeta(1, alpha + 1, n) - w1 <- a + K_n - 1; w2 <- n * (b - log(eta)) + w1 <- a + K_n - 1 + w2 <- n * (b - log(eta)) p_eta <- w1 / (w1 + w2) if (stats::runif(1) < p_eta) { alpha <- stats::rgamma(1, shape = a + K_n, rate = b - log(eta)) @@ -45,8 +50,11 @@ ghosal_hierarchical_bayes <- function(x, a_prior = 1.0, b_prior = 1.0, } draws[m] <- alpha } - burn <- M %/% 4; chain <- draws[(burn + 1):M] - list(estimate = mean(chain), alpha_se = sd(chain), alpha_draws = chain, - K_n = K_n, n = n, - method = "Escobar-West augmentation for alpha | K_n") + burn <- M %/% 4 + chain <- draws[(burn + 1):M] + list( + estimate = mean(chain), alpha_se = sd(chain), alpha_draws = chain, + K_n = K_n, n = n, + method = "Escobar-West augmentation for alpha | K_n" + ) } diff --git a/r-package/morie/R/ghlgd.R b/r-package/morie/R/ghlgd.R index 14f14a78f4..c765ff53d8 100644 --- a/r-package/morie/R/ghlgd.R +++ b/r-package/morie/R/ghlgd.R @@ -1,5 +1,18 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: penalised log-spline density negative log-likelihood. +# Extracted from the morie_ghosal_log_density() optimiser closure for direct +# unit-testing. `Bx`/`Bg` are the data and grid basis matrices, `gz` the +# standardised evaluation grid, `n` the sample size. +.ghlgd_negll <- function(theta, Bx, Bg, gz, n) { + eta_x <- Bx %*% theta + eta_g <- Bg %*% theta + M <- max(eta_g) + Z <- M + log(sum(diff(gz) * (utils::head(exp(eta_g - M), -1) + + utils::tail(exp(eta_g - M), -1))) / 2) + -(sum(eta_x) - n * Z) + 1e-4 * sum(theta^2) +} + #' Log-spline density estimator (Stone 1990, Ghosal Ch 8). #' #' @param x Numeric data vector. @@ -8,37 +21,42 @@ #' @return Named list with estimate, theta, log_lik, grid, log_density, K, n, method. #' @importFrom utils head tail #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_log_density(x = rnorm(50)) #' @export -ghosal_log_density <- function(x, K = 5, grid = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 5) return(list(estimate = NA_real_, n = n, - method = "Log-density (n<5)")) - m <- mean(x); s <- max(sd(x), 1e-6); z <- (x - m) / s - if (is.null(grid)) gz <- seq(min(z) - 1, max(z) + 1, length.out = 401) - else gz <- (grid - m) / s - basis <- function(u) sapply(seq_len(K), function(k) u^k) - Bx <- basis(z); Bg <- basis(gz) - neg_ll <- function(theta) { - eta_x <- Bx %*% theta; eta_g <- Bg %*% theta - M <- max(eta_g) - Z <- M + log(sum(diff(gz) * (head(exp(eta_g - M), -1) + - tail(exp(eta_g - M), -1))) / 2) - -(sum(eta_x) - n * Z) + 1e-4 * sum(theta^2) +morie_ghosal_log_density <- function(x, K = 5, grid = NULL) { + x <- as.numeric(x) + n <- length(x) + if (n < 5) { + return(list( + estimate = NA_real_, n = n, + method = "Log-density (n<5)" + )) + } + m <- mean(x) + s <- max(sd(x), 1e-6) + z <- (x - m) / s + if (is.null(grid)) { + gz <- seq(min(z) - 1, max(z) + 1, length.out = 401) + } else { + gz <- (grid - m) / s } + basis <- function(u) vapply(seq_len(K), function(k) u^k, numeric(length(u))) + Bx <- basis(z) + Bg <- basis(gz) + neg_ll <- function(theta) .ghlgd_negll(theta, Bx, Bg, gz, n) opt <- stats::optim(rep(0, K), neg_ll, method = "BFGS") theta <- opt$par - eta_g <- Bg %*% theta; M <- max(eta_g) + eta_g <- Bg %*% theta + M <- max(eta_g) logZ <- M + log(sum(diff(gz) * (head(exp(eta_g - M), -1) + - tail(exp(eta_g - M), -1))) / 2) + tail(exp(eta_g - M), -1))) / 2) log_density <- as.numeric(eta_g - logZ - log(s)) eta0 <- as.numeric(basis(0) %*% theta) est <- eta0 - logZ - log(s) - list(estimate = est, theta = theta, - log_lik = -(opt$value - 1e-4 * sum(theta^2)), - grid = gz * s + m, log_density = log_density, K = K, n = n, - method = "Log-spline density (Stone 1990)") + list( + estimate = est, theta = theta, + log_lik = -(opt$value - 1e-4 * sum(theta^2)), + grid = gz * s + m, log_density = log_density, K = K, n = n, + method = "Log-spline density (Stone 1990)" + ) } diff --git a/r-package/morie/R/ghmmt.R b/r-package/morie/R/ghmmt.R index 12afb2359e..d3eb3b113d 100644 --- a/r-package/morie/R/ghmmt.R +++ b/r-package/morie/R/ghmmt.R @@ -10,26 +10,26 @@ #' @param base_sd Numeric base-measure sd (default 1). #' @return Named list with estimate, se, prior_mean, prior_var, n_A, n, alpha, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_moment_matching(x = rnorm(50)) #' @export -ghosal_moment_matching <- function(x, alpha = 1.0, A_lower = NULL, - A_upper = NULL, base_mean = 0, - base_sd = 1) { - x <- as.numeric(x); n <- length(x) +morie_ghosal_moment_matching <- function(x, alpha = 1.0, A_lower = NULL, + A_upper = NULL, base_mean = 0, + base_sd = 1) { + x <- as.numeric(x) + n <- length(x) if (is.null(A_lower)) A_lower <- -Inf if (is.null(A_upper)) A_upper <- if (n) mean(x) else 0 G0_A <- max(0, min(1, stats::pnorm(A_upper, base_mean, base_sd) - - stats::pnorm(A_lower, base_mean, base_sd))) + - stats::pnorm(A_lower, base_mean, base_sd))) prior_mean <- G0_A prior_var <- G0_A * (1 - G0_A) / (alpha + 1) n_A <- if (n) sum(x > A_lower & x <= A_upper) else 0L post_mean <- (alpha * G0_A + n_A) / (alpha + n) - post_var <- post_mean * (1 - post_mean) / (alpha + n + 1) - list(estimate = post_mean, se = sqrt(max(post_var, 0)), - prior_mean = prior_mean, prior_var = prior_var, - n_A = as.integer(n_A), n = n, alpha = alpha, - method = "DP moment-matching (Ferguson 1973)") + post_var <- post_mean * (1 - post_mean) / (alpha + n + 1) + list( + estimate = post_mean, se = sqrt(max(post_var, 0)), + prior_mean = prior_mean, prior_var = prior_var, + n_A = as.integer(n_A), n = n, alpha = alpha, + method = "DP moment-matching (Ferguson 1973)" + ) } diff --git a/r-package/morie/R/ghntr.R b/r-package/morie/R/ghntr.R index 20570294c4..8558e0c3ee 100644 --- a/r-package/morie/R/ghntr.R +++ b/r-package/morie/R/ghntr.R @@ -8,19 +8,22 @@ #' @param lam0 Optional baseline hazard rate. #' @return Named list with estimate, times, S_post, H_post, c, lam0, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_neutral_right(time = cumsum(rexp(50))) #' @export -ghosal_neutral_right <- function(time, event = NULL, c = 1.0, lam0 = NULL) { +morie_ghosal_neutral_right <- function(time, event = NULL, c = 1.0, lam0 = NULL) { s <- .gh_surv_post(time, event, c, lam0) - if (is.null(s)) return(list(estimate = NA_real_, n = 0, - method = "NTR process (empty)")) + if (is.null(s)) { + return(list( + estimate = NA_real_, n = 0, + method = "NTR process (empty)" + )) + } t_med <- stats::median(time) idx <- findInterval(t_med, s$times) est <- if (idx >= 1) s$S[idx] else 1 - list(estimate = est, times = s$times, S_post = s$S, H_post = s$H, - c = c, lam0 = s$lam0, n = length(time), - method = "Neutral-to-the-right posterior (Doksum 1974)") + list( + estimate = est, times = s$times, S_post = s$S, H_post = s$H, + c = c, lam0 = s$lam0, n = length(time), + method = "Neutral-to-the-right posterior (Doksum 1974)" + ) } diff --git a/r-package/morie/R/ghreg.R b/r-package/morie/R/ghreg.R index 6105d54acc..6545c948f4 100644 --- a/r-package/morie/R/ghreg.R +++ b/r-package/morie/R/ghreg.R @@ -2,7 +2,7 @@ #' GP nonparametric regression #' -#' Wraps \code{ghosal_gp_squared_exponential}. +#' Wraps \code{morie_ghosal_gp_squared_exponential}. #' #' @param x Numeric vector or matrix of input points. #' @param y Numeric response vector. @@ -12,24 +12,28 @@ #' @return Named list with estimate, se, mu, sd, ci_lower, ci_upper, r2, #' log_marginal, length_scale, noise, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_np_regression(x = rnorm(50), y = rnorm(50)) #' @export -ghosal_np_regression <- function(x, y, length_scale = NULL, - sigma_f = 1.0, noise = NULL) { - gp <- ghosal_gp_squared_exponential(x, y, length_scale = length_scale, - sigma_f = sigma_f, noise = noise) - yv <- as.numeric(y); mu <- gp$mu; sd_ <- gp$sd - ss_tot <- sum((yv - mean(yv))^2); ss_res <- sum((yv - mu)^2) +morie_ghosal_np_regression <- function(x, y, length_scale = NULL, + sigma_f = 1.0, noise = NULL) { + gp <- morie_ghosal_gp_squared_exponential(x, y, + length_scale = length_scale, + sigma_f = sigma_f, noise = noise + ) + yv <- as.numeric(y) + mu <- gp$mu + sd_ <- gp$sd + ss_tot <- sum((yv - mean(yv))^2) + ss_res <- sum((yv - mu)^2) r2 <- 1 - ss_res / max(ss_tot, 1e-12) var_pred <- sd_^2 + gp$noise^2 - log_marg <- -0.5 * sum((yv - mu)^2 / var_pred + log(2*pi*var_pred)) - list(estimate = mean(mu), se = mean(sd_), mu = mu, sd = sd_, - ci_lower = mu - 1.96 * sqrt(var_pred), - ci_upper = mu + 1.96 * sqrt(var_pred), - r2 = r2, log_marginal = log_marg, - length_scale = gp$length_scale, noise = gp$noise, n = length(yv), - method = "GP regression posterior") + log_marg <- -0.5 * sum((yv - mu)^2 / var_pred + log(2 * pi * var_pred)) + list( + estimate = mean(mu), se = mean(sd_), mu = mu, sd = sd_, + ci_lower = mu - 1.96 * sqrt(var_pred), + ci_upper = mu + 1.96 * sqrt(var_pred), + r2 = r2, log_marginal = log_marg, + length_scale = gp$length_scale, noise = gp$noise, n = length(yv), + method = "GP regression posterior" + ) } diff --git a/r-package/morie/R/ghsrv.R b/r-package/morie/R/ghsrv.R index 677fd2eaf1..6c52f2d101 100644 --- a/r-package/morie/R/ghsrv.R +++ b/r-package/morie/R/ghsrv.R @@ -8,20 +8,23 @@ #' @param lam0 Optional baseline hazard rate. #' @return Named list with estimate, times, S_post, H_post, c, lam0, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_survival_beta_process(time = cumsum(rexp(50))) #' @export -ghosal_survival_beta_process <- function(time, event = NULL, c = 1.0, - lam0 = NULL) { +morie_ghosal_survival_beta_process <- function(time, event = NULL, c = 1.0, + lam0 = NULL) { s <- .gh_surv_post(time, event, c, lam0) - if (is.null(s)) return(list(estimate = NA_real_, n = 0, - method = "Beta-process survival (empty)")) + if (is.null(s)) { + return(list( + estimate = NA_real_, n = 0, + method = "Beta-process survival (empty)" + )) + } t_med <- stats::median(time) idx <- findInterval(t_med, s$times) est <- if (idx >= 1) s$S[idx] else 1 - list(estimate = est, times = s$times, S_post = s$S, H_post = s$H, - c = c, lam0 = s$lam0, n = length(time), - method = "Beta-process posterior survival (Hjort 1990)") + list( + estimate = est, times = s$times, S_post = s$S, H_post = s$H, + c = c, lam0 = s$lam0, n = length(time), + method = "Beta-process posterior survival (Hjort 1990)" + ) } diff --git a/r-package/morie/R/ghstk.R b/r-package/morie/R/ghstk.R index 86c79ef6c8..8780a2cad5 100644 --- a/r-package/morie/R/ghstk.R +++ b/r-package/morie/R/ghstk.R @@ -15,22 +15,20 @@ #' @return Named list with estimate, weights, atoms, effective_K, #' trunc_err_bound, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_stick_breaking_trunc(x = rnorm(50)) #' @export -ghosal_stick_breaking_trunc <- function(x, alpha = 1.0, K = 50, seed = 0, - base_mean = NULL, base_sd = NULL, - deterministic_seed = NULL) { +morie_ghosal_stick_breaking_trunc <- function(x, alpha = 1.0, K = 50, seed = 0, + base_mean = NULL, base_sd = NULL, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("ghstk", deterministic_seed) } else { set.seed(seed) } - x <- as.numeric(x); n <- length(x) + x <- as.numeric(x) + n <- length(x) if (is.null(base_mean)) base_mean <- if (n) mean(x) else 0 - if (is.null(base_sd)) base_sd <- if (n > 1) sd(x) else 1 + if (is.null(base_sd)) base_sd <- if (n > 1) sd(x) else 1 base_sd <- max(base_sd, 1e-6) V <- stats::rbeta(K, 1, alpha) log_cum <- c(0, cumsum(log1p(-V[-K]))) @@ -39,8 +37,10 @@ ghosal_stick_breaking_trunc <- function(x, alpha = 1.0, K = 50, seed = 0, t0 <- if (n) mean(x) else base_mean est <- sum(w * (theta <= t0)) trunc_bound <- (alpha / (alpha + 1))^K - list(estimate = est, weights = w, atoms = theta, - effective_K = sum(w > 1e-3), - trunc_err_bound = trunc_bound, n = n, - method = "Truncated stick-breaking DP draw (Sethuraman 1994)") + list( + estimate = est, weights = w, atoms = theta, + effective_K = sum(w > 1e-3), + trunc_err_bound = trunc_bound, n = n, + method = "Truncated stick-breaking DP draw (Sethuraman 1994)" + ) } diff --git a/r-package/morie/R/ghsve.R b/r-package/morie/R/ghsve.R index b84ca541e2..94f07a077b 100644 --- a/r-package/morie/R/ghsve.R +++ b/r-package/morie/R/ghsve.R @@ -6,32 +6,41 @@ #' @param K Optional integer sieve degree (default round(n^(1/3))). #' @return Named list with estimate, log_lik_per_obs, weights, K, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_sieve_prior(x = rnorm(50)) #' @export -ghosal_sieve_prior <- function(x, K = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 3) return(list(estimate = NA_real_, n = n, - method = "Bernstein sieve (n<3)")) - lo <- min(x) - 1e-6; hi <- max(x) + 1e-6 +morie_ghosal_sieve_prior <- function(x, K = NULL) { + x <- as.numeric(x) + n <- length(x) + if (n < 3) { + return(list( + estimate = NA_real_, n = n, + method = "Bernstein sieve (n<3)" + )) + } + lo <- min(x) - 1e-6 + hi <- max(x) + 1e-6 u <- (x - lo) / (hi - lo) - if (is.null(K)) K <- max(2, round(n^(1/3))) + if (is.null(K)) K <- max(2, round(n^(1 / 3))) B <- .gh_bernstein(u, K) - w <- rep(1/K, K) + w <- rep(1 / K, K) for (it in seq_len(60)) { num <- sweep(B, 2, w, "*") denom <- pmax(rowSums(num), 1e-12) gamma <- num / denom - w_new <- colMeans(gamma); w_new <- w_new/sum(w_new) - if (max(abs(w_new - w)) < 1e-8) { w <- w_new; break } + w_new <- colMeans(gamma) + w_new <- w_new / sum(w_new) + if (max(abs(w_new - w)) < 1e-8) { + w <- w_new + break + } w <- w_new } log_lik <- mean(log(pmax(B %*% w, 1e-12))) u_bar <- (mean(x) - lo) / (hi - lo) B_bar <- .gh_bernstein(u_bar, K) f_bar <- as.numeric((B_bar %*% w) / (hi - lo)) - list(estimate = f_bar, log_lik_per_obs = log_lik, weights = w, K = K, n = n, - method = "Bernstein-polynomial sieve density (Petrone 1999, Ghosal 2001)") + list( + estimate = f_bar, log_lik_per_obs = log_lik, weights = w, K = K, n = n, + method = "Bernstein-polynomial sieve density (Petrone 1999, Ghosal 2001)" + ) } diff --git a/r-package/morie/R/ghtst.R b/r-package/morie/R/ghtst.R index 1a5f9f3f2a..f48a59915d 100644 --- a/r-package/morie/R/ghtst.R +++ b/r-package/morie/R/ghtst.R @@ -9,16 +9,18 @@ #' @param c Numeric Polya-tree concentration (default 1). #' @return Named list with statistic (log BF), p_value, BF10, log_BF10, n, depth, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_np_testing(x = rnorm(50)) #' @export -ghosal_np_testing <- function(x, ref_loc = 0, ref_scale = 1, depth = 6, - c = 1.0) { - x <- as.numeric(x); n <- length(x) - if (n < 2) return(list(statistic = NA_real_, p_value = NA_real_, n = n, - method = "Polya-tree BF (n<2)")) +morie_ghosal_np_testing <- function(x, ref_loc = 0, ref_scale = 1, depth = 6, + c = 1.0) { + x <- as.numeric(x) + n <- length(x) + if (n < 2) { + return(list( + statistic = NA_real_, p_value = NA_real_, n = n, + method = "Polya-tree BF (n<2)" + )) + } u <- stats::pnorm(x, mean = ref_loc, sd = ref_scale) log_bf <- 0 for (m in seq_len(depth)) { @@ -28,12 +30,15 @@ ghosal_np_testing <- function(x, ref_loc = 0, ref_scale = 1, depth = 6, bin <- pmin(pmax(bin, 1L), nbins) counts <- tabulate(bin, nbins = nbins) alpha <- c * m * m - n0 <- counts[seq(1, nbins, by = 2)]; n1 <- counts[seq(2, nbins, by = 2)] + n0 <- counts[seq(1, nbins, by = 2)] + n1 <- counts[seq(2, nbins, by = 2)] log_bf <- log_bf + sum(lbeta(alpha + n0, alpha + n1) - lbeta(alpha, alpha)) } BF10 <- exp(log_bf) p_value <- if (BF10 > 1) 1 / (1 + BF10) else 0.5 - list(statistic = log_bf, p_value = p_value, BF10 = BF10, - log_BF10 = log_bf, n = n, depth = depth, - method = "Polya-tree Bayes-factor test (Berger-Guglielmi)") + list( + statistic = log_bf, p_value = p_value, BF10 = BF10, + log_BF10 = log_bf, n = n, depth = depth, + method = "Polya-tree Bayes-factor test (Berger-Guglielmi)" + ) } diff --git a/r-package/morie/R/ghwav.R b/r-package/morie/R/ghwav.R index c62ece72e1..61d039a059 100644 --- a/r-package/morie/R/ghwav.R +++ b/r-package/morie/R/ghwav.R @@ -8,16 +8,20 @@ #' @param noise Optional noise sd. #' @return Named list with estimate, fitted, noise, sigma, inclusion, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ghosal_wavelet_prior(x = rnorm(50)) #' @export -ghosal_wavelet_prior <- function(x, pi = 0.5, sigma = NULL, noise = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 4) return(list(estimate = if (n) mean(x) else NA_real_, - fitted = x, n = n, method = "Wavelet prior (n<4)")) - dw <- .gh_haar_dwt(x); coeffs <- dw$coeffs; L <- dw$L +morie_ghosal_wavelet_prior <- function(x, pi = 0.5, sigma = NULL, noise = NULL) { + x <- as.numeric(x) + n <- length(x) + if (n < 4) { + return(list( + estimate = if (n) mean(x) else NA_real_, + fitted = x, n = n, method = "Wavelet prior (n<4)" + )) + } + dw <- .gh_haar_dwt(x) + coeffs <- dw$coeffs + L <- dw$L finest <- coeffs[[1]] if (is.null(noise)) noise <- max(stats::mad(finest) / 0.6745, 1e-6) if (is.null(sigma)) { @@ -25,13 +29,15 @@ ghosal_wavelet_prior <- function(x, pi = 0.5, sigma = NULL, noise = NULL) { sigma <- sqrt(max(var(all_d) - noise^2, 1e-6)) } sigma <- max(sigma, 1e-6) - incl <- c(); new_coeffs <- list() + incl <- c() + new_coeffs <- list() for (i in seq_along(coeffs[-length(coeffs)])) { d <- coeffs[[i]] var_slab <- sigma^2 + noise^2 - log_slab <- stats::dnorm(d, 0, sqrt(var_slab), log = TRUE) + log_slab <- stats::dnorm(d, 0, sqrt(var_slab), log = TRUE) log_spike <- stats::dnorm(d, 0, noise, log = TRUE) - a <- log(pi) + log_slab; b <- log(1 - pi) + log_spike + a <- log(pi) + log_slab + b <- log(1 - pi) + log_spike mm <- pmax(a, b) w <- exp(a - mm) / (exp(a - mm) + exp(b - mm)) shrink <- sigma^2 / var_slab @@ -49,7 +55,9 @@ ghosal_wavelet_prior <- function(x, pi = 0.5, sigma = NULL, noise = NULL) { cur <- out } fitted <- cur[seq_len(n)] - list(estimate = mean(fitted), fitted = fitted, noise = noise, - sigma = sigma, inclusion = mean(incl), n = n, - method = "Haar-wavelet spike-and-slab BayesThresh") + list( + estimate = mean(fitted), fitted = fitted, noise = noise, + sigma = sigma, inclusion = mean(incl), n = n, + method = "Haar-wavelet spike-and-slab BayesThresh" + ) } diff --git a/r-package/morie/R/gmatv.R b/r-package/morie/R/gmatv.R index a5ac589d79..f483cad2f6 100644 --- a/r-package/morie/R/gmatv.R +++ b/r-package/morie/R/gmatv.R @@ -1,6 +1,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -# `grm_vanraden()` was moved to R/_helpers_montesinos.R so that gblpf.R, +# `morie_grm_vanraden()` was moved to R/_helpers_montesinos.R so that gblpf.R, # mtgbl.R, and mrkvr.R can rely on it being defined regardless of R's # alphabetical source order. See DESCRIPTION Collate: for the explicit # load order. diff --git a/r-package/morie/R/gpfit.R b/r-package/morie/R/gpfit.R index 781de5ff4c..f04f6a4ea1 100644 --- a/r-package/morie/R/gpfit.R +++ b/r-package/morie/R/gpfit.R @@ -1,4 +1,23 @@ # SPDX-License-Identifier: AGPL-3.0-or-later + +# Internal: generalised-Pareto per-exceedance log-density. Lifted from +# the gpfit() optimiser closure so the xi ~ 0 (exponential) and +# out-of-support branches are directly unit-testable. +.gpfit_log_gp <- function(par, y) { + sigma <- exp(par[1]) + xi <- par[2] + if (abs(xi) < 1e-8) { + ll <- -log(sigma) - y / sigma + } else { + arg <- 1 + xi * y / sigma + if (any(arg <= 0)) { + return(rep(-1e10, length(y))) + } + ll <- -log(sigma) - (1 + 1 / xi) * log(arg) + } + ll +} + #' Generalised Pareto fit (POT) by ML (Pickands 1975) #' #' Fits F(x) = 1 - (1 + xi x/sigma)^(-1/xi) to threshold exceedances. @@ -10,40 +29,38 @@ #' @keywords internal gpfit <- function(x, threshold = NULL) { x <- as.numeric(x) - if (length(x) < 5L) + if (length(x) < 5L) { return(list(estimate = NA_real_, n = length(x), method = "GP (n<5)")) + } if (is.null(threshold)) threshold <- stats::quantile(x, 0.90, names = FALSE) excess <- x[x > threshold] - threshold n <- length(excess) - if (n < 5L) - return(list(estimate = NA_real_, n = n, - method = "GP (too few exceedances)")) - log_gp <- function(par, y) { - sigma <- exp(par[1]); xi <- par[2] - if (abs(xi) < 1e-8) { - ll <- -log(sigma) - y / sigma - } else { - arg <- 1 + xi * y / sigma - if (any(arg <= 0)) return(rep(-1e10, length(y))) - ll <- -log(sigma) - (1 + 1/xi) * log(arg) - } - ll + if (n < 5L) { + return(list( + estimate = NA_real_, n = n, + method = "GP (too few exceedances)" + )) } - nll <- function(par) -sum(log_gp(par, excess)) + nll <- function(par) -sum(.gpfit_log_gp(par, excess)) fit <- stats::optim(c(log(stats::sd(excess)), 0.1), nll, - method = "BFGS", hessian = TRUE) - sigma <- exp(fit$par[1]); xi <- fit$par[2] + method = "BFGS", hessian = TRUE + ) + sigma <- exp(fit$par[1]) + xi <- fit$par[2] loglik <- -fit$value J <- diag(c(sigma, 1)) cov_mat <- tryCatch(J %*% solve(fit$hessian) %*% t(J), - error = function(e) matrix(NA, 2, 2)) + error = function(e) matrix(NA, 2, 2) + ) ses <- sqrt(pmax(diag(cov_mat), 0)) - list(scale = as.numeric(sigma), shape = as.numeric(xi), - threshold = as.numeric(threshold), n_exceedances = as.integer(n), - se_sigma = as.numeric(ses[1]), se_xi = as.numeric(ses[2]), - loglik = as.numeric(loglik), - estimate = as.numeric(sigma), se = as.numeric(ses[1]), - method = "GP MLE (Pickands 1975)") + list( + scale = as.numeric(sigma), shape = as.numeric(xi), + threshold = as.numeric(threshold), n_exceedances = as.integer(n), + se_sigma = as.numeric(ses[1]), se_xi = as.numeric(ses[2]), + loglik = as.numeric(loglik), + estimate = as.numeric(sigma), se = as.numeric(ses[1]), + method = "GP MLE (Pickands 1975)" + ) } # CANONICAL TEST @@ -54,4 +71,4 @@ gpfit <- function(x, threshold = NULL) { #' @rdname gpfit #' @keywords internal #' @export -generalized_pareto <- gpfit +morie_generalized_pareto <- gpfit diff --git a/r-package/morie/R/grdcl.R b/r-package/morie/R/grdcl.R index 3702b15bda..909b84fe1b 100644 --- a/r-package/morie/R/grdcl.R +++ b/r-package/morie/R/grdcl.R @@ -8,13 +8,21 @@ #' @keywords internal gradient_clipping <- function(x, max_norm = 1) { is_list <- is.list(x) - cat_vec <- if (is_list) unlist(lapply(x, as.numeric)) - else as.numeric(x) + cat_vec <- if (is_list) { + unlist(lapply(x, as.numeric)) + } else { + as.numeric(x) + } total <- sqrt(sum(cat_vec * cat_vec)) coef <- min(1, max_norm / (total + 1e-12)) - clipped <- if (is_list) lapply(x, function(g) as.numeric(g) * coef) - else as.numeric(x) * coef - list(tensor = clipped, clip_coef = coef, - total_norm = total, max_norm = max_norm, - method = "global-norm-clip") + clipped <- if (is_list) { + lapply(x, function(g) as.numeric(g) * coef) + } else { + as.numeric(x) * coef + } + list( + tensor = clipped, clip_coef = coef, + total_norm = total, max_norm = max_norm, + method = "global-norm-clip" + ) } diff --git a/r-package/morie/R/grdds.R b/r-package/morie/R/grdds.R index 251073d10d..4dc3748603 100644 --- a/r-package/morie/R/grdds.R +++ b/r-package/morie/R/grdds.R @@ -13,15 +13,14 @@ #' @return Named list with \code{estimate}, \code{reference_ols}, #' \code{n_iter}, \code{loss}, \code{n}, \code{method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_gradient_descent_vanilla(x = rnorm(50), y = rnorm(50)) #' @export -gradient_descent_vanilla <- function(x, y, lr = 0.01, n_iter = 1000, tol = 1e-8) { +morie_gradient_descent_vanilla <- function(x, y, lr = 0.01, n_iter = 1000, tol = 1e-8) { if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); y <- as.numeric(y) - n <- nrow(x); p <- ncol(x) + x <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(x) + p <- ncol(x) X1 <- cbind(1, x) theta <- rep(0.0, p + 1) it <- 0L diff --git a/r-package/morie/R/grpqa.R b/r-package/morie/R/grpqa.R index 535d51115f..51aa5f5bec 100644 --- a/r-package/morie/R/grpqa.R +++ b/r-package/morie/R/grpqa.R @@ -13,8 +13,9 @@ grouped_query_attention <- function(Q, K = NULL, V = NULL, n_heads = 8L, n_kv_heads = 2L) { if (is.null(K)) K <- Q if (is.null(V)) V <- Q - if (n_heads %% n_kv_heads != 0L) + if (n_heads %% n_kv_heads != 0L) { stop("n_heads must be a multiple of n_kv_heads") + } group <- n_heads %/% n_kv_heads # Ensure (n_heads, seq, d) and (n_kv_heads, seq, d) shapes. if (length(dim(Q)) == 2L) Q <- array(Q, dim = c(n_heads, dim(Q))) @@ -23,16 +24,18 @@ grouped_query_attention <- function(Q, K = NULL, V = NULL, # Replicate KV across the group dimension. rep_axis0 <- function(A, g) { new <- array(0, dim = c(dim(A)[1L] * g, dim(A)[-1L])) - for (i in seq_len(dim(A)[1L])) - for (j in seq_len(g)) + for (i in seq_len(dim(A)[1L])) { + for (j in seq_len(g)) { new[(i - 1L) * g + j, , ] <- A[i, , ] + } + } new } K_rep <- rep_axis0(K, group) V_rep <- rep_axis0(V, group) d_head <- dim(Q)[3L] attn <- array(0, dim = c(n_heads, dim(Q)[2L], dim(Q)[2L])) - out <- array(0, dim = dim(Q)) + out <- array(0, dim = dim(Q)) scale <- 1 / sqrt(d_head) for (h in seq_len(n_heads)) { Qh <- matrix(Q[h, , ], nrow = dim(Q)[2L], ncol = d_head) @@ -45,6 +48,8 @@ grouped_query_attention <- function(Q, K = NULL, V = NULL, attn[h, , ] <- a out[h, , ] <- a %*% Vh } - list(tensor = out, attn = attn, n_heads = n_heads, - n_kv_heads = n_kv_heads, group_size = group, method = "GQA") + list( + tensor = out, attn = attn, n_heads = n_heads, + n_kv_heads = n_kv_heads, group_size = group, method = "GQA" + ) } diff --git a/r-package/morie/R/grucl.R b/r-package/morie/R/grucl.R index 701386eb3c..0560a4fcc9 100644 --- a/r-package/morie/R/grucl.R +++ b/r-package/morie/R/grucl.R @@ -20,19 +20,21 @@ #' @return Named list \code{(h, estimate, z, r, n, method)}. #' @references Cho et al. (2014), EMNLP. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_grucl_gru_cell(x = rnorm(50)) #' @export -grucl_gru_cell <- function(x, h_prev = NULL, W = NULL, U = NULL, b = NULL, +morie_grucl_gru_cell <- function(x, h_prev = NULL, W = NULL, U = NULL, b = NULL, hidden_size = NULL, seed = 0L, deterministic_seed = NULL) { - x <- as.numeric(x); n_in <- length(x) + x <- as.numeric(x) + n_in <- length(x) if (is.null(hidden_size)) { - hidden_size <- if (!is.null(h_prev)) length(h_prev) - else if (!is.null(W)) nrow(as.matrix(W)) %/% 3L - else n_in + hidden_size <- if (!is.null(h_prev)) { + length(h_prev) + } else if (!is.null(W)) { + nrow(as.matrix(W)) %/% 3L + } else { + n_in + } } H <- as.integer(hidden_size) if (is.null(h_prev)) h_prev <- rep(0, H) @@ -45,19 +47,25 @@ grucl_gru_cell <- function(x, h_prev = NULL, W = NULL, U = NULL, b = NULL, if (is.null(U)) U <- matrix(stats::rnorm(3 * H * H, 0, 0.1), 3 * H, H) if (is.null(b)) b <- rep(0, 3 * H) pre <- as.numeric(W %*% x + b) - Wz <- pre[1:H]; Wr <- pre[(H + 1L):(2 * H)]; Wn <- pre[(2 * H + 1L):(3 * H)] + Wz <- pre[1:H] + Wr <- pre[(H + 1L):(2 * H)] + Wn <- pre[(2 * H + 1L):(3 * H)] Uh <- as.numeric(U %*% h_prev) - Uz <- Uh[1:H]; Ur <- Uh[(H + 1L):(2 * H)]; Un <- Uh[(2 * H + 1L):(3 * H)] + Uz <- Uh[1:H] + Ur <- Uh[(H + 1L):(2 * H)] + Un <- Uh[(2 * H + 1L):(3 * H)] sig <- function(z) 1 / (1 + exp(-z)) z <- sig(Wz + Uz) r <- sig(Wr + Ur) n <- tanh(Wn + r * Un) h <- (1 - z) * n + z * h_prev - list(h = h, estimate = h, z = z, r = r, n = n, - method = "GRU cell forward") + list( + h = h, estimate = h, z = z, r = r, n = n, + method = "GRU cell forward" + ) } -#' @rdname grucl_gru_cell +#' @rdname morie_grucl_gru_cell #' @keywords internal #' @export -gru_cell <- grucl_gru_cell +morie_gru_cell <- morie_grucl_gru_cell diff --git a/r-package/morie/R/gsrch.R b/r-package/morie/R/gsrch.R index 34a4572457..38d8e471f3 100644 --- a/r-package/morie/R/gsrch.R +++ b/r-package/morie/R/gsrch.R @@ -15,41 +15,51 @@ #' @return Named list: estimate (best CV score), best_params, best_score, #' cv_results_params, cv_results_mean_score, task, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_grid_search_cv( +#' x = matrix(rnorm(150), 50, 3), y = rnorm(50), +#' method = "lm", tune_grid = data.frame(intercept = c(TRUE, FALSE)), +#' cv = 3L, task = "regression", seed = 1L +#' ) #' @export -grid_search_cv <- function(x, y, method = NULL, tune_grid = NULL, - cv = 5L, task = "auto", seed = 0L) { +morie_grid_search_cv <- function(x, y, method = NULL, tune_grid = NULL, + cv = 5L, task = "auto", seed = 0L) { + x <- .morie_ensure_design_matrix(x) if (!requireNamespace("caret", quietly = TRUE)) { - stop("Function 'grid_search_cv' requires package 'caret'. Install with install.packages('caret').") + stop("Function 'morie_grid_search_cv' requires package 'caret'. Install with install.packages('caret').") } if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); colnames(x) <- colnames(x) %||% paste0("x", seq_len(ncol(x)) - 1L) + x <- as.matrix(x) + colnames(x) <- colnames(x) %||% paste0("x", seq_len(ncol(x)) - 1L) if (identical(task, "auto")) { - task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) - "classification" else "regression" + task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) { + "classification" + } else { + "regression" + } } set.seed(seed) ctrl <- caret::trainControl(method = "cv", number = cv, classProbs = FALSE) if (is.null(method)) { if (task == "classification") { method <- "glmnet" - if (is.null(tune_grid)) + if (is.null(tune_grid)) { tune_grid <- expand.grid(alpha = 1, lambda = c(0.01, 0.1, 1.0, 10.0)) + } y_use <- factor(make.names(as.character(y))) } else { method <- "ridge" - if (is.null(tune_grid)) + if (is.null(tune_grid)) { tune_grid <- expand.grid(lambda = c(0.01, 0.1, 1.0, 10.0)) + } y_use <- as.numeric(y) } } else { y_use <- if (task == "classification") factor(make.names(as.character(y))) else as.numeric(y) } - fit <- caret::train(x = x, y = y_use, method = method, - tuneGrid = tune_grid, trControl = ctrl) + fit <- caret::train( + x = x, y = y_use, method = method, + tuneGrid = tune_grid, trControl = ctrl + ) best <- fit$bestTune results <- fit$results metric <- if (task == "classification") "Accuracy" else "RMSE" @@ -58,7 +68,11 @@ grid_search_cv <- function(x, y, method = NULL, tune_grid = NULL, estimate = as.numeric(max(scores, na.rm = TRUE)), best_params = as.list(best), best_score = as.numeric(max(scores, na.rm = TRUE)), - cv_results_params = results[, setdiff(colnames(results), c("Accuracy", "Kappa", "RMSE", "Rsquared", "MAE")), drop = FALSE], + cv_results_params = results[ + , setdiff(colnames(results), + c("Accuracy", "Kappa", "RMSE", "Rsquared", "MAE")), + drop = FALSE + ], cv_results_mean_score = scores, task = task, n = nrow(x), diff --git a/r-package/morie/R/gwreg.R b/r-package/morie/R/gwreg.R index 7a953b89c9..575a1366df 100644 --- a/r-package/morie/R/gwreg.R +++ b/r-package/morie/R/gwreg.R @@ -13,39 +13,49 @@ #' n, method. #' @references Brunsdon, Fotheringham & Charlton (1996). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' gwreg(x = rnorm(50), y = rnorm(50), coords = matrix(runif(100), 50, 2)) #' @export gwreg <- function(x, y, coords, bandwidth = NULL, kernel = "gaussian") { - X <- as.matrix(x); y <- as.numeric(y); n <- length(y) - coords <- if (is.matrix(coords)) coords else + X <- as.matrix(x) + y <- as.numeric(y) + n <- length(y) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } k <- ncol(X) D <- as.matrix(stats::dist(coords)) if (is.null(bandwidth)) bandwidth <- stats::median(D[D > 0]) - betas <- matrix(0, n, k); ses <- matrix(0, n, k) + betas <- matrix(0, n, k) + ses <- matrix(0, n, k) for (i in seq_len(n)) { d <- D[i, ] - w <- if (kernel == "bisquare") - ifelse(d <= bandwidth, (1 - (d / bandwidth) ^ 2) ^ 2, 0) - else exp(-0.5 * (d / bandwidth) ^ 2) + w <- if (kernel == "bisquare") { + ifelse(d <= bandwidth, (1 - (d / bandwidth)^2)^2, 0) + } else { + exp(-0.5 * (d / bandwidth)^2) + } sw <- sqrt(w) - Xw <- X * sw; yw <- y * sw + Xw <- X * sw + yw <- y * sw XtWX <- crossprod(Xw) bi <- tryCatch(solve(XtWX, crossprod(Xw, yw)), - error = function(e) qr.solve(XtWX, crossprod(Xw, yw))) + error = function(e) qr.solve(XtWX, crossprod(Xw, yw)) + ) betas[i, ] <- as.numeric(bi) resid <- yw - Xw %*% bi df <- max(sum(w) - k, 1) - sigma2 <- as.numeric(sum(resid ^ 2)) / df + sigma2 <- as.numeric(sum(resid^2)) / df cov_i <- tryCatch(sigma2 * solve(XtWX), - error = function(e) matrix(NA_real_, k, k)) + error = function(e) matrix(NA_real_, k, k) + ) ses[i, ] <- sqrt(pmax(diag(cov_i), 0)) } - list(estimate = betas, se = ses, bandwidth = bandwidth, kernel = kernel, - n = n, method = sprintf("GWR (%s kernel)", kernel)) + list( + estimate = betas, se = ses, bandwidth = bandwidth, kernel = kernel, + n = n, method = sprintf("GWR (%s kernel)", kernel) + ) } # CANONICAL TEST @@ -55,4 +65,4 @@ gwreg <- function(x, y, coords, bandwidth = NULL, kernel = "gaussian") { #' @rdname gwreg #' @keywords internal #' @export -geographically_weighted_regression <- gwreg +morie_geographically_weighted_regression <- gwreg diff --git a/r-package/morie/R/gxemd.R b/r-package/morie/R/gxemd.R index 669bb6bd72..4f1038bd4b 100644 --- a/r-package/morie/R/gxemd.R +++ b/r-package/morie/R/gxemd.R @@ -8,24 +8,29 @@ #' @return list(estimate, g, e, ge, var_g, var_e, var_ge, var_eps, se, n, method). #' @references Montesinos Lopez Ch 11. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -gxe_interaction_model <- function(x, y, env) { - g_id <- x; e_id <- env; yv <- as.numeric(y); n <- length(yv) - g_levels <- unique(g_id); e_levels <- unique(e_id) - G <- length(g_levels); E <- length(e_levels) +morie_gxe_interaction_model <- function(x, y, env) { + g_id <- x + e_id <- env + yv <- as.numeric(y) + n <- length(yv) + g_levels <- unique(g_id) + e_levels <- unique(e_id) + G <- length(g_levels) + E <- length(e_levels) mu <- mean(yv) - g_eff <- sapply(g_levels, function(lv) mean(yv[g_id == lv]) - mu) - e_eff <- sapply(e_levels, function(lv) mean(yv[e_id == lv]) - mu) + g_eff <- vapply(g_levels, function(lv) mean(yv[g_id == lv]) - mu, numeric(1)) + e_eff <- vapply(e_levels, function(lv) mean(yv[e_id == lv]) - mu, numeric(1)) cell_mean <- matrix(NA_real_, G, E) cell_count <- matrix(0L, G, E) - for (i in seq_along(g_levels)) for (j in seq_along(e_levels)) { - msk <- g_id == g_levels[i] & e_id == e_levels[j] - cell_count[i, j] <- sum(msk) - if (any(msk)) cell_mean[i, j] <- mean(yv[msk]) + for (i in seq_along(g_levels)) { + for (j in seq_along(e_levels)) { + msk <- g_id == g_levels[i] & e_id == e_levels[j] + cell_count[i, j] <- sum(msk) + if (any(msk)) cell_mean[i, j] <- mean(yv[msk]) + } } ge_eff <- cell_mean - mu - outer(g_eff, e_eff, "+") + 0 # Equivalent: cell_mean - mu - g_eff[i] - e_eff[j] @@ -36,24 +41,31 @@ gxe_interaction_model <- function(x, y, env) { valid <- !is.na(ge_eff) ss_ge <- sum(cell_count[valid] * ge_eff[valid]^2) resid <- rep(0, n) - for (i in seq_along(g_levels)) for (j in seq_along(e_levels)) { - msk <- g_id == g_levels[i] & e_id == e_levels[j] - if (any(msk)) resid[msk] <- yv[msk] - cell_mean[i, j] + for (i in seq_along(g_levels)) { + for (j in seq_along(e_levels)) { + msk <- g_id == g_levels[i] & e_id == e_levels[j] + if (any(msk)) resid[msk] <- yv[msk] - cell_mean[i, j] + } } ss_eps <- sum(resid^2) - df_g <- max(G - 1, 1); df_e <- max(E - 1, 1) - df_ge <- max((G - 1) * (E - 1), 1); df_eps <- max(n - G * E, 1) - ms_eps <- ss_eps / df_eps; ms_ge <- ss_ge / df_ge + df_g <- max(G - 1, 1) + df_e <- max(E - 1, 1) + df_ge <- max((G - 1) * (E - 1), 1) + df_eps <- max(n - G * E, 1) + ms_eps <- ss_eps / df_eps + ms_ge <- ss_ge / df_ge var_eps <- ms_eps var_ge <- max(0, (ms_ge - ms_eps) / max(n_per_cell, 1)) var_g <- max(0, (ss_g / df_g - ms_ge) / max(E * n_per_cell, 1)) var_e <- max(0, (ss_e / df_e - ms_ge) / max(G * n_per_cell, 1)) - list(estimate = mu, g = g_eff, e = e_eff, ge = ge_eff, - var_g = var_g, var_e = var_e, var_ge = var_ge, var_eps = var_eps, - se = sqrt(var_eps), n = n, - method = "Two-way GxE ANOVA + EMS variance components") + list( + estimate = mu, g = g_eff, e = e_eff, ge = ge_eff, + var_g = var_g, var_e = var_e, var_ge = var_ge, var_eps = var_eps, + se = sqrt(var_eps), n = n, + method = "Two-way GxE ANOVA + EMS variance components" + ) } # CANONICAL TEST # x <- c(1,1,2,2,3,3,1,1,2,2,3,3); env <- c(1,1,1,1,1,1,2,2,2,2,2,2) -# y <- c(1,2,3,4,5,6,2,3,4,5,6,7); gxe_interaction_model(x, y, env) +# y <- c(1,2,3,4,5,6,2,3,4,5,6,7); morie_gxe_interaction_model(x, y, env) diff --git a/r-package/morie/R/hawkes_fit.R b/r-package/morie/R/hawkes_fit.R index c9e30ad7ea..36aad3714d 100644 --- a/r-package/morie/R/hawkes_fit.R +++ b/r-package/morie/R/hawkes_fit.R @@ -18,7 +18,8 @@ weibull = c("a0", "eta", "alpha", "lambda"), lomax = c("a0", "eta", "alpha", "c"), gamma = c("a0", "eta", "alpha", "beta"), - stop("unknown kernel: ", kernel)) + stop("unknown kernel: ", kernel) + ) } # Optimisation runs in an unconstrained space phi to avoid the hard @@ -41,13 +42,18 @@ .hawkes_nll_cpp <- function(theta, times, end_time, kernel) { switch(kernel, exponential = morie_hawkes_ll_exp_const_cpp( - times, end_time, theta[1], theta[2], theta[3]), + times, end_time, theta[1], theta[2], theta[3] + ), weibull = morie_hawkes_ll_weibull_const_cpp( - times, end_time, theta[1], theta[2], theta[3], theta[4]), + times, end_time, theta[1], theta[2], theta[3], theta[4] + ), lomax = morie_hawkes_ll_lomax_const_cpp( - times, end_time, theta[1], theta[2], theta[3], theta[4]), + times, end_time, theta[1], theta[2], theta[3], theta[4] + ), gamma = morie_hawkes_ll_gamma_const_cpp( - times, end_time, theta[1], theta[2], theta[3], theta[4])) + times, end_time, theta[1], theta[2], theta[3], theta[4] + ) + ) } # Triggering kernel g(u) and its integral G(u) = integral_0^u g, for the @@ -55,35 +61,59 @@ .hawkes_kernel_funs <- function(kernel, theta) { if (kernel == "exponential") { beta <- theta[3] - if (beta <= 1e-6) return(NULL) - list(g = function(u) beta * exp(-beta * u), - G = function(u) 1 - exp(-beta * u)) + if (beta <= 1e-6) { + return(NULL) + } + list( + g = function(u) beta * exp(-beta * u), + G = function(u) 1 - exp(-beta * u) + ) } else if (kernel == "weibull") { - alpha <- theta[3]; lam <- theta[4] - if (alpha <= 1e-6 || lam <= 1e-6) return(NULL) - list(g = function(u) (alpha / lam) * (u / lam)^(alpha - 1) * - exp(-(u / lam)^alpha), - G = function(u) 1 - exp(-(u / lam)^alpha)) + alpha <- theta[3] + lam <- theta[4] + if (alpha <= 1e-6 || lam <= 1e-6) { + return(NULL) + } + list( + g = function(u) { + (alpha / lam) * (u / lam)^(alpha - 1) * + exp(-(u / lam)^alpha) + }, + G = function(u) 1 - exp(-(u / lam)^alpha) + ) } else if (kernel == "lomax") { - alpha <- theta[3]; cc <- theta[4] - if (alpha <= 1.001 || cc <= 1e-6) return(NULL) - list(g = function(u) ((alpha - 1) / cc) * (1 + u / cc)^(-alpha), - G = function(u) 1 - (1 + u / cc)^(-(alpha - 1))) + alpha <- theta[3] + cc <- theta[4] + if (alpha <= 1.001 || cc <= 1e-6) { + return(NULL) + } + list( + g = function(u) ((alpha - 1) / cc) * (1 + u / cc)^(-alpha), + G = function(u) 1 - (1 + u / cc)^(-(alpha - 1)) + ) } else { - alpha <- theta[3]; beta <- theta[4] - if (alpha <= 1e-6 || beta <= 1e-6) return(NULL) - list(g = function(u) stats::dgamma(u, shape = alpha, rate = beta), - G = function(u) stats::pgamma(u, shape = alpha, rate = beta)) + alpha <- theta[3] + beta <- theta[4] + if (alpha <= 1e-6 || beta <= 1e-6) { + return(NULL) + } + list( + g = function(u) stats::dgamma(u, shape = alpha, rate = beta), + G = function(u) stats::pgamma(u, shape = alpha, rate = beta) + ) } } .hawkes_nll_pureR <- function(theta, times, end_time, kernel) { - nu <- exp(theta[1]); eta <- theta[2] + nu <- exp(theta[1]) + eta <- theta[2] if (!is.finite(nu) || nu <= 0 || eta <= 1e-6 || eta >= 0.999) { return(1e12) } funs <- .hawkes_kernel_funs(kernel, theta) - if (is.null(funs)) return(1e12) + if (is.null(funs)) { + return(1e12) + } n <- length(times) log_sum <- 0.0 for (i in seq_len(n)) { @@ -91,24 +121,29 @@ if (i > 1L) { lam <- nu + eta * sum(funs$g(times[i] - times[seq_len(i - 1L)])) } - if (!is.finite(lam) || lam <= 0) return(1e12) + if (!is.finite(lam) || lam <= 0) { + return(1e12) + } log_sum <- log_sum + log(lam) } integral <- nu * end_time + eta * sum(funs$G(end_time - times)) - if (!is.finite(log_sum) || !is.finite(integral)) return(1e12) + if (!is.finite(log_sum) || !is.finite(integral)) { + return(1e12) + } -(log_sum - integral) } .hawkes_start <- function(kernel, times, end_time) { n <- length(times) - dt_bar <- end_time / n # mean inter-arrival + dt_bar <- end_time / n # mean inter-arrival a0 <- log(max(0.5 * n / end_time, 1e-3)) eta <- 0.3 switch(kernel, exponential = c(a0, eta, 1 / dt_bar), weibull = c(a0, eta, 1.2, dt_bar), lomax = c(a0, eta, 2.0, dt_bar), - gamma = c(a0, eta, 1.5, 1 / dt_bar)) + gamma = c(a0, eta, 1.5, 1 / dt_bar) + ) } # Closed-form log-likelihood of the homogeneous Poisson submodel @@ -130,7 +165,8 @@ c(0, 2.0, 0, 0), c(0, -2.5, 0, 0), c(0.7, 0, 0.7, 0), - c(-0.7, 1.0, -0.7, 0.5)) + c(-0.7, 1.0, -0.7, 0.5) + ) lapply(offsets, function(o) phi0 + o[seq_along(phi0)]) } @@ -155,16 +191,16 @@ #' \code{branching_ratio}, \code{baseline_rate}, \code{n_events}, #' \code{converged} and the \code{backend} used. #' @examples -#' \dontrun{ -#' set.seed(1) -#' ev <- cumsum(rexp(200, rate = 2)) -#' fit <- morie_hawkes_fit(ev, kernel = "exponential") -#' print(fit) -#' } +#' set.seed(1) +#' ev <- cumsum(rexp(200, rate = 2)) +#' fit <- morie_hawkes_fit(ev, kernel = "exponential") +#' print(fit) #' @export morie_hawkes_fit <- function(times, end_time = NULL, - kernel = c("exponential", "weibull", - "lomax", "gamma")) { + kernel = c( + "exponential", "weibull", + "lomax", "gamma" + )) { kernel <- match.arg(kernel) times <- as.numeric(times) if (anyNA(times) || any(diff(times) < 0)) { @@ -195,9 +231,12 @@ morie_hawkes_fit <- function(times, end_time = NULL, best <- NULL for (phi_s in .hawkes_restarts(phi0)) { run <- tryCatch( - stats::optim(phi_s, obj, method = "Nelder-Mead", - control = list(maxit = 2000, reltol = 1e-10)), - error = function(e) NULL) + stats::optim(phi_s, obj, + method = "Nelder-Mead", + control = list(maxit = 2000, reltol = 1e-10) + ), + error = function(e) NULL + ) if (is.null(run) || !is.finite(run$value)) next if (is.null(best) || run$value < best$value) best <- run } @@ -218,43 +257,63 @@ morie_hawkes_fit <- function(times, end_time = NULL, # a flat ridge in the unidentified directions. degenerate <- eta < 1e-3 structure( - list(kernel = kernel, baseline = "constant", - estimate = est, - branching_ratio = eta, - baseline_rate = exp(unname(est[["a0"]])), - loglik = loglik, aic = 2 * k - 2 * loglik, - loglik_poisson = loglik_pois, - loglik_gain = loglik - loglik_pois, - self_excitation_detected = !degenerate, - n_events = n, end_time = end_time, - converged = best$convergence == 0L || degenerate, - note = if (degenerate) - paste("eta collapsed to ~0: data consistent with a", - "homogeneous Poisson process; kernel-shape", - "parameters are NOT identified") - else NULL, - backend = if (use_cpp) "cpp" else "pure-R"), - class = "morie_hawkes_fit") + list( + kernel = kernel, baseline = "constant", + estimate = est, + branching_ratio = eta, + baseline_rate = exp(unname(est[["a0"]])), + loglik = loglik, aic = 2 * k - 2 * loglik, + loglik_poisson = loglik_pois, + loglik_gain = loglik - loglik_pois, + self_excitation_detected = !degenerate, + n_events = n, end_time = end_time, + converged = best$convergence == 0L || degenerate, + note = if (degenerate) { + paste( + "eta collapsed to ~0: data consistent with a", + "homogeneous Poisson process; kernel-shape", + "parameters are NOT identified" + ) + } else { + NULL + }, + backend = if (use_cpp) "cpp" else "pure-R" + ), + class = "morie_hawkes_fit" + ) } #' @export print.morie_hawkes_fit <- function(x, ...) { - cat(sprintf("morie Hawkes fit -- %s kernel, %s baseline\n", - x$kernel, x$baseline)) - cat(sprintf(" events: %d horizon: %.4g backend: %s\n", - x$n_events, x$end_time, x$backend)) + cat(sprintf( + "morie Hawkes fit -- %s kernel, %s baseline\n", + x$kernel, x$baseline + )) + cat(sprintf( + " events: %d horizon: %.4g backend: %s\n", + x$n_events, x$end_time, x$backend + )) cat(" estimate:\n") for (nm in names(x$estimate)) { cat(sprintf(" %-8s % .6g\n", nm, x$estimate[[nm]])) } cat(sprintf(" baseline rate nu : %.6g\n", x$baseline_rate)) - cat(sprintf(" branching ratio : %.6g%s\n", x$branching_ratio, - if (x$branching_ratio >= 1) " (NOT stationary: eta >= 1)" - else "")) - cat(sprintf(" logLik: %.6g AIC: %.6g converged: %s\n", - x$loglik, x$aic, x$converged)) - cat(sprintf(" vs Poisson: logLik %.6g (gain %+.4g)\n", - x$loglik_poisson, x$loglik_gain)) + cat(sprintf( + " branching ratio : %.6g%s\n", x$branching_ratio, + if (x$branching_ratio >= 1) { + " (NOT stationary: eta >= 1)" + } else { + "" + } + )) + cat(sprintf( + " logLik: %.6g AIC: %.6g converged: %s\n", + x$loglik, x$aic, x$converged + )) + cat(sprintf( + " vs Poisson: logLik %.6g (gain %+.4g)\n", + x$loglik_poisson, x$loglik_gain + )) if (!isTRUE(x$self_excitation_detected)) { cat(" NOTE: ", x$note, "\n", sep = "") } diff --git a/r-package/morie/R/heinz.R b/r-package/morie/R/heinz.R index 86c85ce2ee..91c5de2658 100644 --- a/r-package/morie/R/heinz.R +++ b/r-package/morie/R/heinz.R @@ -18,12 +18,10 @@ #' @return Named list \code{(W, estimate, mean, std, shape, method)}. #' @references He, Zhang, Ren & Sun (2015), ICCV. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -heinz_he_initialization <- function(fan_in, fan_out = NULL, seed = 42L, +morie_heinz_he_initialization <- function(fan_in, fan_out = NULL, seed = 42L, mode = "normal", deterministic_seed = NULL) { fan_in <- as.integer(fan_in) @@ -50,12 +48,14 @@ heinz_he_initialization <- function(fan_in, fan_out = NULL, seed = 42L, stop(sprintf("mode must be 'normal' or 'uniform', got %s", mode)) } if (!is.null(fan_out)) W <- matrix(W, nrow = fan_out, ncol = fan_in) - list(W = W, estimate = W, mean = mean(W), std = stats::sd(W), - shape = shape, - method = sprintf("He initialization (%s)", mode)) + list( + W = W, estimate = W, mean = mean(W), std = stats::sd(W), + shape = shape, + method = sprintf("He initialization (%s)", mode) + ) } -#' @rdname heinz_he_initialization +#' @rdname morie_heinz_he_initialization #' @keywords internal #' @export -he_initialization <- heinz_he_initialization +morie_he_initialization <- morie_heinz_he_initialization diff --git a/r-package/morie/R/hrzb1.R b/r-package/morie/R/hrzb1.R index b456ac4e4a..9b047448c9 100644 --- a/r-package/morie/R/hrzb1.R +++ b/r-package/morie/R/hrzb1.R @@ -1,5 +1,12 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: Manski maximum-score objective (negative average concordance +# of the sign of the index with the +-1 response). Extracted from the +# hrzb1() optimiser closures so it can be unit-tested directly. +.hrzb1_score <- function(b, ys, X) { + -mean(ys * (X %*% b > 0)) +} + #' Manski (1975) maximum-score estimator #' #' @param x Numeric covariate vector or design matrix. @@ -9,47 +16,66 @@ hrzb1 <- function(x, y) { y <- as.numeric(y) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) - n <- nrow(X); p <- ncol(X) - if (n < max(10, 2 * p)) - return(list(estimate = rep(NA_real_, p), se = rep(NA_real_, p), - n = n, method = "maximum-score (insufficient data)")) + n <- nrow(X) + p <- ncol(X) + if (n < max(10, 2 * p)) { + return(list( + estimate = rep(NA_real_, p), se = rep(NA_real_, p), + n = n, method = "maximum-score (insufficient data)" + )) + } ys <- 2 * y - 1 - score <- function(b) -mean(ys * (X %*% b > 0)) + score <- function(b) .hrzb1_score(b, ys, X) beta0 <- as.numeric(stats::coef(stats::lm.fit(X, ys))) - nrm <- sqrt(sum(beta0^2)); if (nrm > 1e-12) beta0 <- beta0 / nrm + nrm <- sqrt(sum(beta0^2)) + if (nrm > 1e-12) beta0 <- beta0 / nrm if (beta0[1] < 0) beta0 <- -beta0 - best <- beta0; best_l <- score(best) + best <- beta0 + best_l <- score(best) set.seed(0) for (k in 1:8) { - s <- stats::rnorm(p); s <- s / sqrt(sum(s^2)) - r <- stats::optim(s, score, method = "Nelder-Mead", - control = list(maxit = 300, reltol = 1e-4)) + s <- stats::rnorm(p) + s <- s / sqrt(sum(s^2)) + r <- stats::optim(s, score, + method = "Nelder-Mead", + control = list(maxit = 300, reltol = 1e-4) + ) b <- r$par / max(sqrt(sum(r$par^2)), 1e-12) if (b[1] < 0) b <- -b l <- score(b) - if (l < best_l) { best_l <- l; best <- b } + if (l < best_l) { + best_l <- l + best <- b + } } # Subsample SE (cube-root rescale) - set.seed(42); B <- 30; m <- max(20L, n %/% 2L) + set.seed(42) + B <- 30 + m <- max(20L, n %/% 2L) boot <- matrix(0, B, p) for (b_idx in 1:B) { idx <- sample.int(n, m, replace = FALSE) - Xb <- X[idx, , drop = FALSE]; yb <- ys[idx] - sc <- function(b) -mean(yb * (Xb %*% b > 0)) - r <- stats::optim(best + 0.05 * stats::rnorm(p), sc, method = "Nelder-Mead", - control = list(maxit = 150)) + Xb <- X[idx, , drop = FALSE] + yb <- ys[idx] + sc <- function(b) .hrzb1_score(b, yb, Xb) + r <- stats::optim(best + 0.05 * stats::rnorm(p), sc, + method = "Nelder-Mead", + control = list(maxit = 150) + ) bb <- r$par / max(sqrt(sum(r$par^2)), 1e-12) if (bb[1] < 0) bb <- -bb boot[b_idx, ] <- bb } - se <- apply(boot, 2, stats::sd) * (m / n)^(1/3) - list(estimate = best, se = se, score = -best_l, n = n, - method = "Manski (1975) maximum-score (binary response)", - warnings = list("Cube-root asymptotics: subsample-rescaled SEs.")) + se <- apply(boot, 2, stats::sd) * (m / n)^(1 / 3) + list( + estimate = best, se = se, score = -best_l, n = n, + method = "Manski (1975) maximum-score (binary response)", + warnings = list("Cube-root asymptotics: subsample-rescaled SEs.") + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzb1 #' @keywords internal #' @export -horowitz_binary_response <- hrzb1 +morie_horowitz_binary_response <- hrzb1 diff --git a/r-package/morie/R/hrzb2.R b/r-package/morie/R/hrzb2.R index 6396805367..638767f555 100644 --- a/r-package/morie/R/hrzb2.R +++ b/r-package/morie/R/hrzb2.R @@ -1,5 +1,17 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: smoothed maximum-score loss. Extracted from the hrzb2() +# optimiser closure so the zero-norm guard is directly unit-testable. +.hrzb2_loss <- function(b, X, ys, h) { + nb <- sqrt(sum(b^2)) + if (nb < 1e-12) { + return(1e12) + } + bn <- b / nb + z <- (X %*% bn) / h + -mean(ys * stats::pnorm(z)) +} + #' Horowitz (1992) smoothed maximum-score estimator #' #' @param x Numeric covariate vector or design matrix. @@ -10,23 +22,29 @@ hrzb2 <- function(x, y, bandwidth = NULL) { y <- as.numeric(y) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) - n <- nrow(X); p <- ncol(X) - if (n < max(10, 2 * p)) - return(list(estimate = rep(NA_real_, p), se = rep(NA_real_, p), - n = n, method = "smoothed-max-score (insufficient data)")) + n <- nrow(X) + p <- ncol(X) + if (n < max(10, 2 * p)) { + return(list( + estimate = rep(NA_real_, p), se = rep(NA_real_, p), + n = n, method = "smoothed-max-score (insufficient data)" + )) + } ys <- 2 * y - 1 - h <- if (is.null(bandwidth)) max(.hrz_silverman(X %*% rep(1 / sqrt(p), p)), 1e-3) - else as.numeric(bandwidth) - loss <- function(b) { - nb <- sqrt(sum(b^2)); if (nb < 1e-12) return(1e12) - bn <- b / nb; z <- (X %*% bn) / h - -mean(ys * stats::pnorm(z)) + h <- if (is.null(bandwidth)) { + max(.hrz_silverman(X %*% rep(1 / sqrt(p), p)), 1e-3) + } else { + as.numeric(bandwidth) } + loss <- function(b) .hrzb2_loss(b, X, ys, h) beta0 <- as.numeric(stats::coef(stats::lm.fit(X, ys))) - nrm <- sqrt(sum(beta0^2)); if (nrm > 1e-12) beta0 <- beta0 / nrm + nrm <- sqrt(sum(beta0^2)) + if (nrm > 1e-12) beta0 <- beta0 / nrm if (beta0[1] < 0) beta0 <- -beta0 - res <- stats::optim(beta0, loss, method = "BFGS", - control = list(maxit = 200)) + res <- stats::optim(beta0, loss, + method = "BFGS", + control = list(maxit = 200) + ) bh <- res$par / max(sqrt(sum(res$par^2)), 1e-12) if (bh[1] < 0) bh <- -bh z <- (X %*% bh) / h @@ -34,14 +52,17 @@ hrzb2 <- function(x, y, bandwidth = NULL) { score_i <- -as.numeric(ys * phi) * X / h info <- t(score_i) %*% score_i / n cov_m <- tryCatch(MASS::ginv(info) / n, - error = function(e) matrix(NA, p, p)) + error = function(e) matrix(NA, p, p) + ) se <- sqrt(pmax(diag(cov_m), 0)) - list(estimate = bh, se = se, bandwidth = h, n = n, - method = "Horowitz (1992) smoothed maximum-score") + list( + estimate = bh, se = se, bandwidth = h, n = n, + method = "Horowitz (1992) smoothed maximum-score" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzb2 #' @keywords internal #' @export -horowitz_smoothed_maximum_score <- hrzb2 +morie_horowitz_smoothed_maximum_score <- hrzb2 diff --git a/r-package/morie/R/hrzc1.R b/r-package/morie/R/hrzc1.R index d1d889295c..156b92f9cd 100644 --- a/r-package/morie/R/hrzc1.R +++ b/r-package/morie/R/hrzc1.R @@ -10,42 +10,58 @@ hrzc1 <- function(x, y, censor = 0.0) { y <- as.numeric(y) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) - n <- nrow(X); p <- ncol(X); c <- as.numeric(censor) - if (n < max(10, 2 * p)) - return(list(estimate = rep(NA_real_, p), se = rep(NA_real_, p), - n = n, method = "CLAD (insufficient data)")) + n <- nrow(X) + p <- ncol(X) + c <- as.numeric(censor) + if (n < max(10, 2 * p)) { + return(list( + estimate = rep(NA_real_, p), se = rep(NA_real_, p), + n = n, method = "CLAD (insufficient data)" + )) + } keep <- y > c - if (sum(keep) < max(5, p + 1)) - return(list(estimate = rep(NA_real_, p), se = rep(NA_real_, p), - n = n, method = "CLAD (too few uncensored obs)")) + if (sum(keep) < max(5, p + 1)) { + return(list( + estimate = rep(NA_real_, p), se = rep(NA_real_, p), + n = n, method = "CLAD (too few uncensored obs)" + )) + } beta <- .hrz_qreg_irls(X[keep, , drop = FALSE], y[keep]) for (k in 1:30) { active <- as.numeric(X %*% beta) > c if (sum(active) < max(5, p + 1)) break new <- .hrz_qreg_irls(X[active, , drop = FALSE], y[active]) - if (max(abs(new - beta)) < 1e-5) { beta <- new; break } + if (max(abs(new - beta)) < 1e-5) { + beta <- new + break + } beta <- new } - r <- y - as.numeric(X %*% beta); active <- as.numeric(X %*% beta) > c + r <- y - as.numeric(X %*% beta) + active <- as.numeric(X %*% beta) > c if (sum(active) < max(5, p + 1)) { se <- rep(NA_real_, p) } else { - Xa <- X[active, , drop = FALSE]; ra <- r[active] - h <- max(1.06 * stats::sd(ra) * length(ra)^(-1/5), 1e-4) + Xa <- X[active, , drop = FALSE] + ra <- r[active] + h <- max(1.06 * stats::sd(ra) * length(ra)^(-1 / 5), 1e-4) f0 <- mean(exp(-0.5 * (ra / h)^2) / (h * sqrt(2 * pi))) A <- t(Xa) %*% Xa * f0 cov_m <- tryCatch(0.25 * MASS::ginv(A) %*% (t(Xa) %*% Xa) %*% MASS::ginv(A), - error = function(e) matrix(NA, p, p)) + error = function(e) matrix(NA, p, p) + ) se <- sqrt(pmax(diag(cov_m), 0)) } - list(estimate = if (p == 1) as.numeric(beta) else as.numeric(beta), - se = if (p == 1) as.numeric(se) else as.numeric(se), - n = n, n_uncensored = as.integer(sum(active)), censor = c, - method = "Powell (1984) censored LAD") + list( + estimate = if (p == 1) as.numeric(beta) else as.numeric(beta), + se = if (p == 1) as.numeric(se) else as.numeric(se), + n = n, n_uncensored = as.integer(sum(active)), censor = c, + method = "Powell (1984) censored LAD" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzc1 #' @keywords internal #' @export -horowitz_censored_regression <- hrzc1 +morie_horowitz_censored_regression <- hrzc1 diff --git a/r-package/morie/R/hrzd1.R b/r-package/morie/R/hrzd1.R index 81cec8b438..5ae9487af9 100644 --- a/r-package/morie/R/hrzd1.R +++ b/r-package/morie/R/hrzd1.R @@ -8,14 +8,20 @@ #' @return Named list with estimate, se, n, n_events, method. #' @keywords internal hrzd1 <- function(t, x, event) { - t <- as.numeric(t); event <- as.numeric(event) + t <- as.numeric(t) + event <- as.numeric(event) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) - n <- nrow(X); p <- ncol(X) - if (n < max(10, 2 * p) || length(t) != n || length(event) != n) - return(list(estimate = rep(NA_real_, p), se = rep(NA_real_, p), - n = n, method = "Cox PH (insufficient data)")) + n <- nrow(X) + p <- ncol(X) + if (n < max(10, 2 * p) || length(t) != n || length(event) != n) { + return(list( + estimate = rep(NA_real_, p), se = rep(NA_real_, p), + n = n, method = "Cox PH (insufficient data)" + )) + } o <- order(-t) - Xs <- X[o, , drop = FALSE]; ev <- event[o] + Xs <- X[o, , drop = FALSE] + ev <- event[o] beta <- rep(0, p) H <- diag(p) for (it in 1:50) { @@ -29,30 +35,37 @@ hrzd1 <- function(t, x, event) { score <- colSums(ev * diff_X) # build cumulative S2 (n, p, p) S2 <- array(0, c(n, p, p)) - for (i in 1:p) for (j in 1:p) { - S2[, i, j] <- cumsum(Xs[, i] * Xs[, j] * ehb) + for (i in 1:p) { + for (j in 1:p) { + S2[, i, j] <- cumsum(Xs[, i] * Xs[, j] * ehb) + } } var_X <- array(0, c(n, p, p)) - for (i in 1:p) for (j in 1:p) { - var_X[, i, j] <- S2[, i, j] / pmax(S0, 1e-12) - mean_X[, i] * mean_X[, j] + for (i in 1:p) { + for (j in 1:p) { + var_X[, i, j] <- S2[, i, j] / pmax(S0, 1e-12) - mean_X[, i] * mean_X[, j] + } } info <- matrix(0, p, p) for (i in 1:p) for (j in 1:p) info[i, j] <- sum(ev * var_X[, i, j]) step <- tryCatch(solve(info + 1e-8 * diag(p), score), - error = function(e) MASS::ginv(info) %*% score) + error = function(e) MASS::ginv(info) %*% score + ) beta <- beta + as.numeric(step) if (max(abs(step)) < 1e-6) break } cov_m <- tryCatch(MASS::ginv(info), error = function(e) matrix(NA, p, p)) se <- sqrt(pmax(diag(cov_m), 0)) - list(estimate = if (p == 1) as.numeric(beta) else as.numeric(beta), - se = if (p == 1) as.numeric(se) else as.numeric(se), - n = n, n_events = as.integer(sum(event)), - method = "Cox proportional hazards (partial likelihood)") + list( + estimate = if (p == 1) as.numeric(beta) else as.numeric(beta), + se = if (p == 1) as.numeric(se) else as.numeric(se), + n = n, n_events = as.integer(sum(event)), + method = "Cox proportional hazards (partial likelihood)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzd1 #' @keywords internal #' @export -horowitz_duration_model <- hrzd1 +morie_horowitz_duration_model <- hrzd1 diff --git a/r-package/morie/R/hrzi1.R b/r-package/morie/R/hrzi1.R index 4975565b7a..3e4ea83a73 100644 --- a/r-package/morie/R/hrzi1.R +++ b/r-package/morie/R/hrzi1.R @@ -1,5 +1,24 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: Ichimura single-index leave-one-out objective. Extracted +# from the hrzi1() optimiser closure so the zero-norm guard is directly +# unit-testable. `h0` is the kernel bandwidth. +.hrzi1_obj <- function(b, X, y, h0) { + nb <- sqrt(sum(b^2)) + if (nb < 1e-12) { + return(1e12) + } + bn <- b / nb + idx <- as.numeric(X %*% bn) + u <- outer(idx, idx, `-`) / h0 + w <- exp(-0.5 * u^2) + diag(w) <- 0 + wsum <- rowSums(w) + safe <- ifelse(wsum > 0, wsum, 1) + g_hat <- as.numeric((w %*% y) / safe) + mean((y - g_hat)^2) +} + #' Ichimura (1993) single-index model #' #' @param x Numeric covariate vector or design matrix. @@ -10,47 +29,59 @@ hrzi1 <- function(x, y, bandwidth = NULL) { y <- as.numeric(y) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) - n <- nrow(X); p <- ncol(X) - if (n < max(10, 2 * p)) - return(list(estimate = rep(NA_real_, p), se = rep(NA_real_, p), - n = n, method = "single-index (insufficient data)")) + n <- nrow(X) + p <- ncol(X) + if (n < max(10, 2 * p)) { + return(list( + estimate = rep(NA_real_, p), se = rep(NA_real_, p), + n = n, method = "single-index (insufficient data)" + )) + } beta0 <- as.numeric(stats::coef(stats::lm.fit(X, y))) nrm <- sqrt(sum(beta0^2)) if (nrm < 1e-10) beta0 <- rep(1, p) / sqrt(p) else beta0 <- beta0 / nrm if (beta0[1] < 0) beta0 <- -beta0 h0 <- if (is.null(bandwidth)) .hrz_silverman(X %*% beta0) else as.numeric(bandwidth) - obj <- function(b) { - nb <- sqrt(sum(b^2)); if (nb < 1e-12) return(1e12) - bn <- b / nb; idx <- as.numeric(X %*% bn) - u <- outer(idx, idx, `-`) / h0 - w <- exp(-0.5 * u^2); diag(w) <- 0 - wsum <- rowSums(w); safe <- ifelse(wsum > 0, wsum, 1) - g_hat <- as.numeric((w %*% y) / safe) - mean((y - g_hat)^2) - } - res <- stats::optim(beta0, obj, method = "Nelder-Mead", - control = list(maxit = 200, reltol = 1e-5)) - bh <- res$par; bh <- bh / max(sqrt(sum(bh^2)), 1e-12) + obj <- function(b) .hrzi1_obj(b, X, y, h0) + res <- stats::optim(beta0, obj, + method = "Nelder-Mead", + control = list(maxit = 200, reltol = 1e-5) + ) + bh <- res$par + bh <- bh / max(sqrt(sum(bh^2)), 1e-12) if (bh[1] < 0) bh <- -bh # Numerical Hessian for SE - eps <- 1e-4; H <- matrix(0, p, p) - for (i in 1:p) for (j in 1:p) { - bp <- bh; bp[i] <- bp[i] + eps; bp[j] <- bp[j] + eps - bm <- bh; bm[i] <- bm[i] - eps; bm[j] <- bm[j] - eps - bpm <- bh; bpm[i] <- bpm[i] + eps; bpm[j] <- bpm[j] - eps - bmp <- bh; bmp[i] <- bmp[i] - eps; bmp[j] <- bmp[j] + eps - H[i, j] <- (obj(bp) - obj(bpm) - obj(bmp) + obj(bm)) / (4 * eps^2) + eps <- 1e-4 + H <- matrix(0, p, p) + for (i in 1:p) { + for (j in 1:p) { + bp <- bh + bp[i] <- bp[i] + eps + bp[j] <- bp[j] + eps + bm <- bh + bm[i] <- bm[i] - eps + bm[j] <- bm[j] - eps + bpm <- bh + bpm[i] <- bpm[i] + eps + bpm[j] <- bpm[j] - eps + bmp <- bh + bmp[i] <- bmp[i] - eps + bmp[j] <- bmp[j] + eps + H[i, j] <- (obj(bp) - obj(bpm) - obj(bmp) + obj(bm)) / (4 * eps^2) + } } H <- 0.5 * (H + t(H)) cov_m <- tryCatch(MASS::ginv(H) / n, error = function(e) matrix(NA, p, p)) se <- sqrt(pmax(diag(cov_m), 0)) - list(estimate = bh, se = se, bandwidth = h0, n = n, loss = res$value, - method = "Ichimura (1993) single-index model") + list( + estimate = bh, se = se, bandwidth = h0, n = n, loss = res$value, + method = "Ichimura (1993) single-index model" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzi1 #' @keywords internal #' @export -horowitz_index_model <- hrzi1 +morie_horowitz_index_model <- hrzi1 diff --git a/r-package/morie/R/hrzi2.R b/r-package/morie/R/hrzi2.R index 978ab4de24..9c15a0edcf 100644 --- a/r-package/morie/R/hrzi2.R +++ b/r-package/morie/R/hrzi2.R @@ -10,17 +10,21 @@ hrzi2 <- function(x, y, bandwidth = NULL) { y <- as.numeric(y) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) - n <- nrow(X); p <- ncol(X) - if (n < max(20, 2 * p)) - return(list(estimate = rep(NA_real_, p), se = rep(NA_real_, p), - n = n, method = "avg-deriv (insufficient data)")) + n <- nrow(X) + p <- ncol(X) + if (n < max(20, 2 * p)) { + return(list( + estimate = rep(NA_real_, p), se = rep(NA_real_, p), + n = n, method = "avg-deriv (insufficient data)" + )) + } h <- if (is.null(bandwidth)) .hrz_silverman(X[, 1]) else as.numeric(bandwidth) if (h <= 0) h <- max(.hrz_silverman(X[, 1]), 1e-6) # Pairwise differences diffs <- array(0, c(n, n, p)) for (j in seq_len(p)) diffs[, , j] <- outer(X[, j], X[, j], `-`) sq <- apply(diffs^2, c(1, 2), sum) / (h^2) - K <- exp(-0.5 * sq) / ((2 * pi)^(p/2) * h^p) + K <- exp(-0.5 * sq) / ((2 * pi)^(p / 2) * h^p) diag(K) <- 0 grad_f <- matrix(0, n, p) for (j in seq_len(p)) grad_f[, j] <- -rowSums(diffs[, , j] * K) / (n * h^2) @@ -31,14 +35,16 @@ hrzi2 <- function(x, y, bandwidth = NULL) { } else { se <- sqrt(pmax(diag(stats::cov(psi)) / n, 0)) } - list(estimate = if (p == 1) as.numeric(delta) else as.numeric(delta), - se = if (p == 1) as.numeric(se) else as.numeric(se), - bandwidth = h, n = n, - method = "Powell-Stock-Stoker density-weighted average derivative") + list( + estimate = if (p == 1) as.numeric(delta) else as.numeric(delta), + se = if (p == 1) as.numeric(se) else as.numeric(se), + bandwidth = h, n = n, + method = "Powell-Stock-Stoker density-weighted average derivative" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzi2 #' @keywords internal #' @export -horowitz_average_derivative <- hrzi2 +morie_horowitz_average_derivative <- hrzi2 diff --git a/r-package/morie/R/hrzk1.R b/r-package/morie/R/hrzk1.R index 3fa806b754..d4965b2033 100644 --- a/r-package/morie/R/hrzk1.R +++ b/r-package/morie/R/hrzk1.R @@ -9,27 +9,35 @@ #' @keywords internal hrzk1 <- function(x, bandwidth = NULL, sample = NULL) { if (is.null(sample)) { - data <- as.numeric(x); grid <- data + data <- as.numeric(x) + grid <- data } else { - data <- as.numeric(sample); grid <- as.numeric(x) + data <- as.numeric(sample) + grid <- as.numeric(x) } n <- length(data) - if (n < 2) return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "kernel-density (insufficient data)")) + if (n < 2) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "kernel-density (insufficient data)" + )) + } h <- if (is.null(bandwidth)) .hrz_silverman(data) else as.numeric(bandwidth) if (h <= 0) h <- .hrz_silverman(data) diffs <- outer(grid, data, `-`) / h w <- exp(-0.5 * diffs^2) / sqrt(2 * pi) f_hat <- rowMeans(w) / h se <- sqrt(pmax(f_hat, 0) * .hrz_R_K_gaussian / (n * h)) - list(estimate = if (length(f_hat) == 1) as.numeric(f_hat) else f_hat, - se = if (length(se) == 1) as.numeric(se) else se, - bandwidth = h, n = n, kernel = "gaussian", - method = "Rosenblatt-Parzen kernel density") + list( + estimate = if (length(f_hat) == 1) as.numeric(f_hat) else f_hat, + se = if (length(se) == 1) as.numeric(se) else se, + bandwidth = h, n = n, kernel = "gaussian", + method = "Rosenblatt-Parzen kernel density" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzk1 #' @keywords internal #' @export -horowitz_kernel_density <- hrzk1 +morie_horowitz_kernel_density <- hrzk1 diff --git a/r-package/morie/R/hrzk2.R b/r-package/morie/R/hrzk2.R index 8f538d7664..148ef5e436 100644 --- a/r-package/morie/R/hrzk2.R +++ b/r-package/morie/R/hrzk2.R @@ -9,28 +9,36 @@ #' @return Named list with estimate, se, bandwidth, n, method. #' @keywords internal hrzk2 <- function(x, y, bandwidth = NULL, grid = NULL) { - x <- as.numeric(x); y <- as.numeric(y); n <- length(x) - if (n < 2 || length(y) != n) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "NW (insufficient data)")) + x <- as.numeric(x) + y <- as.numeric(y) + n <- length(x) + if (n < 2 || length(y) != n) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "NW (insufficient data)" + )) + } h <- if (is.null(bandwidth)) .hrz_silverman(x) else as.numeric(bandwidth) if (h <= 0) h <- .hrz_silverman(x) g <- if (is.null(grid)) x else as.numeric(grid) u <- outer(g, x, `-`) / h w <- exp(-0.5 * u^2) - wsum <- rowSums(w); safe <- ifelse(wsum > 0, wsum, 1) + wsum <- rowSums(w) + safe <- ifelse(wsum > 0, wsum, 1) m_hat <- (w %*% y) / safe resid <- outer(rep(1, length(g)), y) - matrix(m_hat, length(g), n) sigma2 <- rowSums(w * resid^2) / safe f_hat <- wsum / (n * h * sqrt(2 * pi)) se <- sqrt(pmax(sigma2, 0) * .hrz_R_K_gaussian / (n * h * pmax(f_hat, 1e-12))) - list(estimate = as.numeric(m_hat), se = as.numeric(se), - bandwidth = h, n = n, - method = "Nadaraya-Watson kernel regression (Gaussian)") + list( + estimate = as.numeric(m_hat), se = as.numeric(se), + bandwidth = h, n = n, + method = "Nadaraya-Watson kernel regression (Gaussian)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzk2 #' @keywords internal #' @export -horowitz_kernel_regression <- hrzk2 +morie_horowitz_kernel_regression <- hrzk2 diff --git a/r-package/morie/R/hrzk3.R b/r-package/morie/R/hrzk3.R index 445544a63a..912a2d9a9e 100644 --- a/r-package/morie/R/hrzk3.R +++ b/r-package/morie/R/hrzk3.R @@ -9,37 +9,50 @@ #' @return Named list with estimate, se, bandwidth, n, method. #' @keywords internal hrzk3 <- function(x, y, bandwidth = NULL, grid = NULL) { - x <- as.numeric(x); y <- as.numeric(y); n <- length(x) - if (n < 3 || length(y) != n) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "local-linear (insufficient data)")) + x <- as.numeric(x) + y <- as.numeric(y) + n <- length(x) + if (n < 3 || length(y) != n) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "local-linear (insufficient data)" + )) + } h <- if (is.null(bandwidth)) .hrz_silverman(x) else as.numeric(bandwidth) if (h <= 0) h <- .hrz_silverman(x) g <- if (is.null(grid)) x else as.numeric(grid) - m_hat <- numeric(length(g)); se <- numeric(length(g)) + m_hat <- numeric(length(g)) + se <- numeric(length(g)) for (i in seq_along(g)) { u <- (x - g[i]) / h w <- exp(-0.5 * u^2) - if (sum(w) <= 1e-12) { m_hat[i] <- NA; se[i] <- NA; next } + if (sum(w) <= 1e-12) { + m_hat[i] <- NA + se[i] <- NA + next + } X <- cbind(1, x - g[i]) WX <- X * w XtWX <- t(X) %*% WX beta <- tryCatch(solve(XtWX, t(WX) %*% y), - error = function(e) MASS::ginv(XtWX) %*% (t(WX) %*% y)) + error = function(e) MASS::ginv(XtWX) %*% (t(WX) %*% y) + ) m_hat[i] <- beta[1] r <- y - X %*% beta sigma2 <- sum(w * r^2) / max(sum(w), 1e-12) f_hat <- sum(w) / (n * h * sqrt(2 * pi)) se[i] <- sqrt(max(sigma2, 0) * .hrz_R_K_gaussian / (n * h * max(f_hat, 1e-12))) } - list(estimate = if (length(m_hat) == 1) m_hat[1] else m_hat, - se = if (length(se) == 1) se[1] else se, - bandwidth = h, n = n, - method = "Local-linear regression (Gaussian kernel)") + list( + estimate = if (length(m_hat) == 1) m_hat[1] else m_hat, + se = if (length(se) == 1) se[1] else se, + bandwidth = h, n = n, + method = "Local-linear regression (Gaussian kernel)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzk3 #' @keywords internal #' @export -horowitz_local_linear <- hrzk3 +morie_horowitz_local_linear <- hrzk3 diff --git a/r-package/morie/R/hrzm1.R b/r-package/morie/R/hrzm1.R index cf3ac2fedd..290d701840 100644 --- a/r-package/morie/R/hrzm1.R +++ b/r-package/morie/R/hrzm1.R @@ -10,20 +10,27 @@ #' @return Named list with estimate (pi, mu, sigma), log_likelihood, n, k, iters, method. #' @keywords internal hrzm1 <- function(y, k = 2, maxit = 200, tol = 1e-6, seed = 0) { - y <- as.numeric(y); n <- length(y) - if (n < max(10, 3 * k)) - return(list(estimate = NA_real_, n = n, - method = "mixture-EM (insufficient data)")) + y <- as.numeric(y) + n <- length(y) + if (n < max(10, 3 * k)) { + return(list( + estimate = NA_real_, n = n, + method = "mixture-EM (insufficient data)" + )) + } set.seed(seed) mu <- as.numeric(stats::quantile(y, seq(0.1, 0.9, length.out = k))) sigma <- rep(stats::sd(y) / k + 1e-3, k) pii <- rep(1 / k, k) - ll_prev <- -Inf; it <- 0 + ll_prev <- -Inf + it <- 0 for (it in 1:maxit) { - comps <- sapply(1:k, function(j) pii[j] * stats::dnorm(y, mu[j], sigma[j])) - denom <- rowSums(comps); denom <- ifelse(denom > 0, denom, 1e-12) + comps <- vapply(1:k, function(j) pii[j] * stats::dnorm(y, mu[j], sigma[j]), numeric(length(y))) + denom <- rowSums(comps) + denom <- ifelse(denom > 0, denom, 1e-12) gamma_w <- comps / denom - Nk <- colSums(gamma_w); Nk <- ifelse(Nk > 0, Nk, 1e-12) + Nk <- colSums(gamma_w) + Nk <- ifelse(Nk > 0, Nk, 1e-12) mu <- colSums(gamma_w * y) / Nk sigma <- sqrt(colSums(gamma_w * (y - matrix(mu, n, k, byrow = TRUE))^2) / Nk) sigma <- pmax(sigma, 1e-4) @@ -32,14 +39,18 @@ hrzm1 <- function(y, k = 2, maxit = 200, tol = 1e-6, seed = 0) { if (abs(ll - ll_prev) < tol) break ll_prev <- ll } - list(estimate = list(pi = as.numeric(pii), mu = as.numeric(mu), - sigma = as.numeric(sigma)), - log_likelihood = ll_prev, n = n, k = k, iters = it, - method = sprintf("%d-component Gaussian mixture EM", k)) + list( + estimate = list( + pi = as.numeric(pii), mu = as.numeric(mu), + sigma = as.numeric(sigma) + ), + log_likelihood = ll_prev, n = n, k = k, iters = it, + method = sprintf("%d-component Gaussian mixture EM", k) + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzm1 #' @keywords internal #' @export -horowitz_mixture_model <- hrzm1 +morie_horowitz_mixture_model <- hrzm1 diff --git a/r-package/morie/R/hrzn1.R b/r-package/morie/R/hrzn1.R index 3912528c74..b4e045a59a 100644 --- a/r-package/morie/R/hrzn1.R +++ b/r-package/morie/R/hrzn1.R @@ -13,19 +13,25 @@ #' @keywords internal hrzn1 <- function(x, y, z, J = 5, alpha = 1e-3, grid = NULL, .bootstrap = TRUE) { - x <- as.numeric(x); y <- as.numeric(y); z <- as.numeric(z) + x <- as.numeric(x) + y <- as.numeric(y) + z <- as.numeric(z) n <- length(y) if (n < 50 || length(x) != n || length(z) != n) { # 2SLS fallback - Xc <- cbind(1, x); Zc <- cbind(1, z) + Xc <- cbind(1, x) + Zc <- cbind(1, z) Pz <- Zc %*% MASS::ginv(t(Zc) %*% Zc) %*% t(Zc) beta <- MASS::ginv(t(Xc) %*% Pz %*% Xc) %*% (t(Xc) %*% Pz %*% y) - return(list(estimate = as.numeric(beta[2]), se = NA_real_, n = n, - method = "NPIV fallback: linear 2SLS")) + return(list( + estimate = as.numeric(beta[2]), se = NA_real_, n = n, + method = "NPIV fallback: linear 2SLS" + )) } x_s <- (x - mean(x)) / max(stats::sd(x), 1e-6) z_s <- (z - mean(z)) / max(stats::sd(z), 1e-6) - Bx <- .hrz_hermite(x_s, J); Bz <- .hrz_hermite(z_s, J) + Bx <- .hrz_hermite(x_s, J) + Bz <- .hrz_hermite(z_s, J) M <- (t(Bz) %*% Bx) / n BzY <- as.numeric((t(Bz) %*% y) / n) BzBz <- (t(Bz) %*% Bz) / n @@ -40,25 +46,32 @@ hrzn1 <- function(x, y, z, J = 5, alpha = 1e-3, grid = NULL, g_hat <- as.numeric(Bx_g %*% coef) # Bootstrap SE (guarded against recursion explosion) if (.bootstrap) { - set.seed(0); B <- 30 + set.seed(0) + B <- 30 boot <- matrix(0, B, length(grid)) for (b in 1:B) { idx <- sample.int(n, n, replace = TRUE) - sub <- tryCatch(hrzn1(x[idx], y[idx], z[idx], J = J, alpha = alpha, - grid = grid, .bootstrap = FALSE), - error = function(e) list(estimate = g_hat)) + sub <- tryCatch( + hrzn1(x[idx], y[idx], z[idx], + J = J, alpha = alpha, + grid = grid, .bootstrap = FALSE + ), + error = function(e) list(estimate = g_hat) + ) boot[b, ] <- as.numeric(sub$estimate) } se <- apply(boot, 2, stats::sd) } else { se <- rep(NA_real_, length(grid)) } - list(estimate = g_hat, se = as.numeric(se), grid = grid, J = J, alpha = alpha, - n = n, method = "Series-Tikhonov NPIV on Hermite basis") + list( + estimate = g_hat, se = as.numeric(se), grid = grid, J = J, alpha = alpha, + n = n, method = "Series-Tikhonov NPIV on Hermite basis" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzn1 #' @keywords internal #' @export -horowitz_nonparametric_iv <- hrzn1 +morie_horowitz_nonparametric_iv <- hrzn1 diff --git a/r-package/morie/R/hrzn2.R b/r-package/morie/R/hrzn2.R index 44172b4b2b..5a19d01e57 100644 --- a/r-package/morie/R/hrzn2.R +++ b/r-package/morie/R/hrzn2.R @@ -11,33 +11,46 @@ #' @keywords internal hrzn2 <- function(y, sigma_u = 0.5, bandwidth = NULL, grid = NULL, noise = "laplace") { - y <- as.numeric(y); n <- length(y) - if (n < 30) return(list(estimate = NA_real_, n = n, - method = "deconvolution (insufficient data)")) - h <- if (is.null(bandwidth)) max(1.5 * stats::sd(y) * n^(-1/7), 1e-3) - else as.numeric(bandwidth) + y <- as.numeric(y) + n <- length(y) + if (n < 30) { + return(list( + estimate = NA_real_, n = n, + method = "deconvolution (insufficient data)" + )) + } + h <- if (is.null(bandwidth)) { + max(1.5 * stats::sd(y) * n^(-1 / 7), 1e-3) + } else { + as.numeric(bandwidth) + } if (is.null(grid)) grid <- seq(min(y), max(y), length.out = 51) grid <- as.numeric(grid) - T <- seq(-15, 15, length.out = 2049) / max(h, 1e-3) - dt <- T[2] - T[1] - phi_Y <- colMeans(exp(1i * outer(y, T))) - phi_U <- if (noise == "normal") exp(-0.5 * (sigma_u * T)^2) - else 1 / (1 + (sigma_u * T)^2) - th <- T * h + t_grid <- seq(-15, 15, length.out = 2049) / max(h, 1e-3) + dt <- t_grid[2] - t_grid[1] + phi_Y <- colMeans(exp(1i * outer(y, t_grid))) + phi_U <- if (noise == "normal") { + exp(-0.5 * (sigma_u * t_grid)^2) + } else { + 1 / (1 + (sigma_u * t_grid)^2) + } + th <- t_grid * h phi_K <- ifelse(abs(th) <= 1, (1 - th^2)^3, 0) integrand <- phi_K * phi_Y / ifelse(abs(phi_U) > 1e-10, phi_U, complex(real = Inf)) f_hat <- numeric(length(grid)) for (i in seq_along(grid)) { - f_hat[i] <- Re(sum(exp(-1i * T * grid[i]) * integrand)) * dt / (2 * pi) + f_hat[i] <- Re(sum(exp(-1i * t_grid * grid[i]) * integrand)) * dt / (2 * pi) } f_hat <- pmax(f_hat, 0) - list(estimate = f_hat, grid = grid, bandwidth = h, - sigma_u = as.numeric(sigma_u), noise = noise, n = n, - method = "Fourier deconvolution density (sinc kernel)") + list( + estimate = f_hat, grid = grid, bandwidth = h, + sigma_u = as.numeric(sigma_u), noise = noise, n = n, + method = "Fourier deconvolution density (sinc kernel)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzn2 #' @keywords internal #' @export -horowitz_deconvolution <- hrzn2 +morie_horowitz_deconvolution <- hrzn2 diff --git a/r-package/morie/R/hrzp1.R b/r-package/morie/R/hrzp1.R index aaed18cc32..266331854a 100644 --- a/r-package/morie/R/hrzp1.R +++ b/r-package/morie/R/hrzp1.R @@ -13,31 +13,41 @@ hrzp1 <- function(x, y, z, bandwidth = NULL) { X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) Z <- if (is.null(dim(z))) matrix(z, ncol = 1) else as.matrix(z) n <- length(y) - if (n < 5 || nrow(X) != n || nrow(Z) != n) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "PLR (insufficient data)")) + if (n < 5 || nrow(X) != n || nrow(Z) != n) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "PLR (insufficient data)" + )) + } h <- if (is.null(bandwidth)) .hrz_silverman(Z[, 1]) else as.numeric(bandwidth) if (h <= 0) h <- max(.hrz_silverman(Z[, 1]), 1e-6) Zs <- if (ncol(Z) == 1) Z[, 1] else Z mY <- .hrz_nw_loo(Zs, y, h) - mX <- sapply(seq_len(ncol(X)), function(j) .hrz_nw_loo(Zs, X[, j], h)) + mX <- vapply( + seq_len(ncol(X)), function(j) .hrz_nw_loo(Zs, X[, j], h), + numeric(NROW(Zs)) + ) if (is.null(dim(mX))) mX <- matrix(mX, ncol = ncol(X)) - rY <- y - mY; rX <- X - mX + rY <- y - mY + rX <- X - mX beta <- tryCatch(MASS::ginv(t(rX) %*% rX) %*% (t(rX) %*% rY), - error = function(e) rep(NA_real_, ncol(X))) + error = function(e) rep(NA_real_, ncol(X)) + ) resid <- rY - rX %*% beta bread <- MASS::ginv(t(rX) %*% rX) meat <- t(rX) %*% (rX * as.numeric(resid)^2) cov_m <- bread %*% meat %*% bread se <- sqrt(pmax(diag(cov_m), 0)) - list(estimate = if (length(beta) == 1) as.numeric(beta) else as.numeric(beta), - se = if (length(se) == 1) as.numeric(se) else as.numeric(se), - bandwidth = h, n = n, - method = "Robinson (1988) partially-linear regression") + list( + estimate = if (length(beta) == 1) as.numeric(beta) else as.numeric(beta), + se = if (length(se) == 1) as.numeric(se) else as.numeric(se), + bandwidth = h, n = n, + method = "Robinson (1988) partially-linear regression" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzp1 #' @keywords internal #' @export -horowitz_plr_estimator <- hrzp1 +morie_horowitz_plr_estimator <- hrzp1 diff --git a/r-package/morie/R/hrzp2.R b/r-package/morie/R/hrzp2.R index cab22dd735..9d0cb25da7 100644 --- a/r-package/morie/R/hrzp2.R +++ b/r-package/morie/R/hrzp2.R @@ -8,20 +8,27 @@ #' @return Named list with estimate (h), n, sigma, c, method. #' @keywords internal hrzp2 <- function(x, y, c = 1.06) { - x <- as.numeric(x); n <- length(x) - if (n < 5) return(list(estimate = NA_real_, n = n, - method = "plr-bandwidth (insufficient data)")) + x <- as.numeric(x) + n <- length(x) + if (n < 5) { + return(list( + estimate = NA_real_, n = n, + method = "plr-bandwidth (insufficient data)" + )) + } s <- stats::sd(x) iqr <- diff(stats::quantile(x, c(0.25, 0.75), na.rm = TRUE)) sigma <- if (iqr > 0) min(s, iqr / 1.349) else s if (sigma <= 0) sigma <- max(s, 1e-6) - h <- as.numeric(c * sigma * n ^ (-1/5)) - list(estimate = h, n = n, sigma = as.numeric(sigma), c = c, - method = "Silverman h = c * sigma * n^(-1/5)") + h <- as.numeric(c * sigma * n^(-1 / 5)) + list( + estimate = h, n = n, sigma = as.numeric(sigma), c = c, + method = "Silverman h = c * sigma * n^(-1/5)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzp2 #' @keywords internal #' @export -horowitz_plr_bandwidth <- hrzp2 +morie_horowitz_plr_bandwidth <- hrzp2 diff --git a/r-package/morie/R/hrzq1.R b/r-package/morie/R/hrzq1.R index 48b39d0789..924cbd8b75 100644 --- a/r-package/morie/R/hrzq1.R +++ b/r-package/morie/R/hrzq1.R @@ -10,35 +10,46 @@ hrzq1 <- function(x, y, tau = 0.5) { y <- as.numeric(y) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) - n <- nrow(X); p <- ncol(X) - if (n < max(10, 2 * p) || !(tau > 0 && tau < 1)) - return(list(estimate = rep(NA_real_, p), se = rep(NA_real_, p), - n = n, tau = tau, - method = "QReg (insufficient data or invalid tau)")) + n <- nrow(X) + p <- ncol(X) + if (n < max(10, 2 * p) || !(tau > 0 && tau < 1)) { + return(list( + estimate = rep(NA_real_, p), se = rep(NA_real_, p), + n = n, tau = tau, + method = "QReg (insufficient data or invalid tau)" + )) + } has_int <- isTRUE(all(X[, 1] == 1)) Xp <- if (!has_int) cbind(1, X) else X beta <- .hrz_qreg_irls(Xp, y, tau) r <- y - as.numeric(Xp %*% beta) - h <- (stats::qnorm(1 - 0.05)^(2/3)) * + h <- (stats::qnorm(1 - 0.05)^(2 / 3)) * ((1.5 * stats::dnorm(stats::qnorm(tau))^2) / - (2 * stats::qnorm(tau)^2 + 1))^(1/3) * n^(-1/3) + (2 * stats::qnorm(tau)^2 + 1))^(1 / 3) * n^(-1 / 3) h <- max(h, 1e-3) - f0 <- mean(abs(r) < h) / (2 * h); if (f0 < 1e-6) f0 <- 1e-6 + f0 <- mean(abs(r) < h) / (2 * h) + if (f0 < 1e-6) f0 <- 1e-6 cov_m <- (tau * (1 - tau) / f0^2) * MASS::ginv(t(Xp) %*% Xp) se_all <- sqrt(pmax(diag(cov_m), 0)) if (!has_int) { beta_out <- if (ncol(Xp) > 1) beta[-1] else beta se_out <- if (ncol(Xp) > 1) se_all[-1] else se_all intercept <- as.numeric(beta[1]) - } else { beta_out <- beta; se_out <- se_all; intercept <- NULL } - list(estimate = if (length(beta_out) == 1) as.numeric(beta_out) else beta_out, - se = if (length(se_out) == 1) as.numeric(se_out) else se_out, - intercept = intercept, tau = tau, n = n, - method = "Koenker-Bassett quantile regression (IRLS)") + } else { + beta_out <- beta + se_out <- se_all + intercept <- NULL + } + list( + estimate = if (length(beta_out) == 1) as.numeric(beta_out) else beta_out, + se = if (length(se_out) == 1) as.numeric(se_out) else se_out, + intercept = intercept, tau = tau, n = n, + method = "Koenker-Bassett quantile regression (IRLS)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzq1 #' @keywords internal #' @export -horowitz_quantile_regression <- hrzq1 +morie_horowitz_quantile_regression <- hrzq1 diff --git a/r-package/morie/R/hrzs1.R b/r-package/morie/R/hrzs1.R index a14a159e53..b05d75b5b9 100644 --- a/r-package/morie/R/hrzs1.R +++ b/r-package/morie/R/hrzs1.R @@ -9,22 +9,29 @@ #' @return Named list with estimate, se, selection_correction, n, n_selected, method. #' @keywords internal hrzs1 <- function(x, y, z, d) { - y <- as.numeric(y); d <- as.numeric(d) + y <- as.numeric(y) + d <- as.numeric(d) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) Z <- if (is.null(dim(z))) matrix(z, ncol = 1) else as.matrix(z) n <- length(y) - if (n < 20 || nrow(X) != n || nrow(Z) != n) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "sample-selection (insufficient data)")) + if (n < 20 || nrow(X) != n || nrow(Z) != n) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "sample-selection (insufficient data)" + )) + } Zc <- if (all(Z[, 1] == 1)) Z else cbind(1, Z) gamma <- .hrz_probit_newton(d, Zc) eta <- as.numeric(Zc %*% gamma) mills <- stats::dnorm(eta) / pmax(stats::pnorm(eta), 1e-8) Xc <- if (all(X[, 1] == 1)) X else cbind(1, X) sel <- d > 0.5 - if (sum(sel) < max(10, ncol(Xc) + 2)) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "sample-selection (too few selected)")) + if (sum(sel) < max(10, ncol(Xc) + 2)) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "sample-selection (too few selected)" + )) + } M <- cbind(Xc[sel, , drop = FALSE], mills[sel]) yy <- y[sel] coef <- as.numeric(MASS::ginv(t(M) %*% M) %*% (t(M) %*% yy)) @@ -34,15 +41,17 @@ hrzs1 <- function(x, y, z, d) { sigma2 <- mean(resid^2) cov_m <- sigma2 * MASS::ginv(t(M) %*% M) se_all <- sqrt(pmax(diag(cov_m), 0)) - list(estimate = as.numeric(beta), - se = as.numeric(se_all[seq_len(ncol(Xc))]), - selection_correction = as.numeric(rho_sigma), n = n, - n_selected = as.integer(sum(sel)), - method = "Semiparametric Heckman/Powell-Newey-Vella sample selection") + list( + estimate = as.numeric(beta), + se = as.numeric(se_all[seq_len(ncol(Xc))]), + selection_correction = as.numeric(rho_sigma), n = n, + n_selected = as.integer(sum(sel)), + method = "Semiparametric Heckman/Powell-Newey-Vella sample selection" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzs1 #' @keywords internal #' @export -horowitz_sample_selection <- hrzs1 +morie_horowitz_sample_selection <- hrzs1 diff --git a/r-package/morie/R/hrzt1.R b/r-package/morie/R/hrzt1.R index 3790817f02..cece217e8a 100644 --- a/r-package/morie/R/hrzt1.R +++ b/r-package/morie/R/hrzt1.R @@ -10,51 +10,70 @@ #' @return Named list with estimate, se, att, atu, bandwidth, n, method. #' @keywords internal hrzt1 <- function(x, y, treatment, bandwidth = NULL, .bootstrap = TRUE) { - y <- as.numeric(y); D <- as.numeric(treatment) + y <- as.numeric(y) + D <- as.numeric(treatment) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) n <- length(y) - if (n < 30 || length(D) != n || nrow(X) != n) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "kernel-matching ATE (insufficient data)")) + if (n < 30 || length(D) != n || nrow(X) != n) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "kernel-matching ATE (insufficient data)" + )) + } Xp <- if (!isTRUE(all(X[, 1] == 1))) cbind(1, X) else X e <- .hrz_logit_newton(D, Xp) e <- pmin(pmax(e, 1e-6), 1 - 1e-6) h <- if (is.null(bandwidth)) max(.hrz_silverman(e), 1e-3) else as.numeric(bandwidth) - t_idx <- which(D > 0.5); c_idx <- which(D < 0.5) - if (length(t_idx) < 2 || length(c_idx) < 2) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "kernel-matching ATE (one arm empty)")) - e_t <- e[t_idx]; e_c <- e[c_idx] - u <- outer(e_t, e_c, `-`) / h; K <- exp(-0.5 * u^2) + t_idx <- which(D > 0.5) + c_idx <- which(D < 0.5) + if (length(t_idx) < 2 || length(c_idx) < 2) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "kernel-matching ATE (one arm empty)" + )) + } + e_t <- e[t_idx] + e_c <- e[c_idx] + u <- outer(e_t, e_c, `-`) / h + K <- exp(-0.5 * u^2) w <- K / pmax(rowSums(K), 1e-12) cf_t <- as.numeric(w %*% y[c_idx]) - u2 <- outer(e_c, e_t, `-`) / h; K2 <- exp(-0.5 * u2^2) + u2 <- outer(e_c, e_t, `-`) / h + K2 <- exp(-0.5 * u2^2) w2 <- K2 / pmax(rowSums(K2), 1e-12) cf_c <- as.numeric(w2 %*% y[t_idx]) - att <- mean(y[t_idx] - cf_t); atu <- mean(cf_c - y[c_idx]) + att <- mean(y[t_idx] - cf_t) + atu <- mean(cf_c - y[c_idx]) ate <- (length(t_idx) * att + length(c_idx) * atu) / n # Bootstrap SE (guarded against recursive blow-up) se <- NA_real_ if (.bootstrap) { - set.seed(0); B <- 50; boot <- numeric(B) + set.seed(0) + B <- 50 + boot <- numeric(B) for (b in 1:B) { idx <- sample.int(n, n, replace = TRUE) - sub <- tryCatch(hrzt1(X[idx, , drop = FALSE], y[idx], D[idx], - bandwidth = h, .bootstrap = FALSE), - error = function(e) list(estimate = ate)) + sub <- tryCatch( + hrzt1(X[idx, , drop = FALSE], y[idx], D[idx], + bandwidth = h, .bootstrap = FALSE + ), + error = function(e) list(estimate = ate) + ) boot[b] <- if (is.numeric(sub$estimate) && !is.na(sub$estimate)) sub$estimate else ate } se <- as.numeric(stats::sd(boot)) } - list(estimate = as.numeric(ate), se = se, - att = att, atu = atu, bandwidth = h, n = n, - n_treated = as.integer(length(t_idx)), - n_control = as.integer(length(c_idx)), - method = "Kernel-matching ATE (Heckman-Ichimura-Todd)") + list( + estimate = as.numeric(ate), se = se, + att = att, atu = atu, bandwidth = h, n = n, + n_treated = as.integer(length(t_idx)), + n_control = as.integer(length(c_idx)), + method = "Kernel-matching ATE (Heckman-Ichimura-Todd)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzt1 #' @keywords internal #' @export -horowitz_treatment_effect <- hrzt1 +morie_horowitz_treatment_effect <- hrzt1 diff --git a/r-package/morie/R/hrzt2.R b/r-package/morie/R/hrzt2.R index 38840e3e38..0fdb0d21d7 100644 --- a/r-package/morie/R/hrzt2.R +++ b/r-package/morie/R/hrzt2.R @@ -9,36 +9,56 @@ #' @return Named list with estimate, se, first_stage, reduced_form, n, method. #' @keywords internal hrzt2 <- function(x, y, z, treatment) { - y <- as.numeric(y); z <- as.numeric(z); D <- as.numeric(treatment) + y <- as.numeric(y) + z <- as.numeric(z) + D <- as.numeric(treatment) n <- length(y) - if (n < 20 || length(z) != n || length(D) != n) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "LATE (insufficient data)")) + if (n < 20 || length(z) != n || length(D) != n) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "LATE (insufficient data)" + )) + } uniq <- unique(z) - z_bin <- if (length(uniq) > 2) as.numeric(z > stats::median(z)) else + z_bin <- if (length(uniq) > 2) { + as.numeric(z > stats::median(z)) + } else { as.numeric(z == max(uniq)) - n1 <- sum(z_bin > 0.5); n0 <- sum(z_bin < 0.5) - if (n1 < 5 || n0 < 5) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "LATE (one arm of Z empty)")) - Y1 <- mean(y[z_bin > 0.5]); Y0 <- mean(y[z_bin < 0.5]) - D1 <- mean(D[z_bin > 0.5]); D0 <- mean(D[z_bin < 0.5]) - num <- Y1 - Y0; den <- D1 - D0 - if (abs(den) < 1e-8) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "LATE (weak instrument)")) + } + n1 <- sum(z_bin > 0.5) + n0 <- sum(z_bin < 0.5) + if (n1 < 5 || n0 < 5) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "LATE (one arm of Z empty)" + )) + } + Y1 <- mean(y[z_bin > 0.5]) + Y0 <- mean(y[z_bin < 0.5]) + D1 <- mean(D[z_bin > 0.5]) + D0 <- mean(D[z_bin < 0.5]) + num <- Y1 - Y0 + den <- D1 - D0 + if (abs(den) < 1e-8) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "LATE (weak instrument)" + )) + } late <- num / den vY <- stats::var(y[z_bin > 0.5]) / n1 + stats::var(y[z_bin < 0.5]) / n0 vD <- stats::var(D[z_bin > 0.5]) / n1 + stats::var(D[z_bin < 0.5]) / n0 v_late <- (vY + late^2 * vD) / den^2 - list(estimate = as.numeric(late), se = sqrt(max(v_late, 0)), - first_stage = as.numeric(den), reduced_form = as.numeric(num), - n = n, - method = "IV Wald estimator (Imbens-Angrist LATE)") + list( + estimate = as.numeric(late), se = sqrt(max(v_late, 0)), + first_stage = as.numeric(den), reduced_form = as.numeric(num), + n = n, + method = "IV Wald estimator (Imbens-Angrist LATE)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzt2 #' @keywords internal #' @export -horowitz_local_ate <- hrzt2 +morie_horowitz_local_ate <- hrzt2 diff --git a/r-package/morie/R/hrzw1.R b/r-package/morie/R/hrzw1.R index 84ae9ebeb5..1750b99b7d 100644 --- a/r-package/morie/R/hrzw1.R +++ b/r-package/morie/R/hrzw1.R @@ -12,13 +12,20 @@ hrzw1 <- function(x, y, residuals = NULL, B = 500, seed = 0) { y <- as.numeric(y) X <- if (is.null(dim(x))) matrix(x, ncol = 1) else as.matrix(x) - n <- nrow(X); p <- ncol(X) - if (n < max(10, 2 * p)) - return(list(estimate = NA_real_, se = NA_real_, n = n, - method = "wild-bootstrap (insufficient data)")) + n <- nrow(X) + p <- ncol(X) + if (n < max(10, 2 * p)) { + return(list( + estimate = NA_real_, se = NA_real_, n = n, + method = "wild-bootstrap (insufficient data)" + )) + } beta0 <- as.numeric(MASS::ginv(t(X) %*% X) %*% (t(X) %*% y)) - res <- if (is.null(residuals)) y - as.numeric(X %*% beta0) - else as.numeric(residuals) + res <- if (is.null(residuals)) { + y - as.numeric(X %*% beta0) + } else { + as.numeric(residuals) + } set.seed(seed) XtX_inv <- MASS::ginv(t(X) %*% X) boot <- matrix(0, B, p) @@ -27,20 +34,23 @@ hrzw1 <- function(x, y, residuals = NULL, B = 500, seed = 0) { y_star <- as.numeric(X %*% beta0) + res * v boot[b, ] <- as.numeric(XtX_inv %*% (t(X) %*% y_star)) } - mean_b <- colMeans(boot); se <- apply(boot, 2, stats::sd) + mean_b <- colMeans(boot) + se <- apply(boot, 2, stats::sd) ci_lo <- apply(boot, 2, stats::quantile, 0.025) ci_hi <- apply(boot, 2, stats::quantile, 0.975) - list(estimate = if (p == 1) as.numeric(beta0) else beta0, - se = if (p == 1) as.numeric(se) else se, - ci_lower = if (p == 1) as.numeric(ci_lo) else ci_lo, - ci_upper = if (p == 1) as.numeric(ci_hi) else ci_hi, - boot_mean = if (p == 1) as.numeric(mean_b) else mean_b, - B = B, n = n, - method = "Rademacher wild bootstrap (Mammen 1993)") + list( + estimate = if (p == 1) as.numeric(beta0) else beta0, + se = if (p == 1) as.numeric(se) else se, + ci_lower = if (p == 1) as.numeric(ci_lo) else ci_lo, + ci_upper = if (p == 1) as.numeric(ci_hi) else ci_hi, + boot_mean = if (p == 1) as.numeric(mean_b) else mean_b, + B = B, n = n, + method = "Rademacher wild bootstrap (Mammen 1993)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzw1 #' @keywords internal #' @export -horowitz_wild_bootstrap <- hrzw1 +morie_horowitz_wild_bootstrap <- hrzw1 diff --git a/r-package/morie/R/hrzw2.R b/r-package/morie/R/hrzw2.R index bcad72e3d7..79e296bbcc 100644 --- a/r-package/morie/R/hrzw2.R +++ b/r-package/morie/R/hrzw2.R @@ -10,13 +10,20 @@ #' @return Named list with estimate (h_star), h_silverman, mise_curve, h_grid, n, B, method. #' @keywords internal hrzw2 <- function(x, y, B = 50, n_h = 15, seed = 0) { - x <- as.numeric(x); y <- as.numeric(y); n <- length(x) - if (n < 30 || length(y) != n) - return(list(estimate = NA_real_, n = n, - method = "bw-bootstrap (insufficient data)")) + x <- as.numeric(x) + y <- as.numeric(y) + n <- length(x) + if (n < 30 || length(y) != n) { + return(list( + estimate = NA_real_, n = n, + method = "bw-bootstrap (insufficient data)" + )) + } nw_fit <- function(x_train, y_train, x_eval, h) { u <- outer(x_eval, x_train, `-`) / h - w <- exp(-0.5 * u^2); s <- rowSums(w); safe <- ifelse(s > 0, s, 1) + w <- exp(-0.5 * u^2) + s <- rowSums(w) + safe <- ifelse(s > 0, s, 1) as.numeric((w %*% y_train) / safe) } h_sil <- .hrz_silverman(x) @@ -36,13 +43,15 @@ hrzw2 <- function(x, y, B = 50, n_h = 15, seed = 0) { mise[j] <- ise / B } j_star <- which.min(mise) - list(estimate = as.numeric(h_grid[j_star]), h_silverman = as.numeric(h_sil), - mise_curve = mise, h_grid = h_grid, n = n, B = B, - method = "Wild-bootstrap MISE bandwidth selection (Faraway-Jhun)") + list( + estimate = as.numeric(h_grid[j_star]), h_silverman = as.numeric(h_sil), + mise_curve = mise, h_grid = h_grid, n = n, B = B, + method = "Wild-bootstrap MISE bandwidth selection (Faraway-Jhun)" + ) } # canonical full-name alias (Py<->R API parity) #' @rdname hrzw2 #' @keywords internal #' @export -horowitz_bandwidth_bootstrap <- hrzw2 +morie_horowitz_bandwidth_bootstrap <- hrzw2 diff --git a/r-package/morie/R/idlpt.R b/r-package/morie/R/idlpt.R index 1f38874749..d6df615b8e 100644 --- a/r-package/morie/R/idlpt.R +++ b/r-package/morie/R/idlpt.R @@ -12,10 +12,8 @@ #' @return Named list with `ideal_points`, `n_respondents`, `k`, #' `mean_stim_dist`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export idlpt <- function(X_r, X_s = NULL) { Xr <- if (is.matrix(X_r)) X_r else matrix(as.numeric(X_r), ncol = 1L) @@ -25,22 +23,27 @@ idlpt <- function(X_r, X_s = NULL) { Xs <- if (is.matrix(X_s)) X_s else matrix(as.numeric(X_s), ncol = 1L) dvec <- numeric(nrow(Xr) * nrow(Xs)) idx <- 1L - for (i in seq_len(nrow(Xr))) for (j in seq_len(nrow(Xs))) { - dvec[idx] <- sqrt(sum((Xr[i, ] - Xs[j, ])^2)); idx <- idx + 1L + for (i in seq_len(nrow(Xr))) { + for (j in seq_len(nrow(Xs))) { + dvec[idx] <- sqrt(sum((Xr[i, ] - Xs[j, ])^2)) + idx <- idx + 1L + } } msd <- mean(dvec) } - list(ideal_points = out, n_respondents = nrow(Xr), - k = ncol(Xr), mean_stim_dist = msd, - method = "ideal_point_recovery") + list( + ideal_points = out, n_respondents = nrow(Xr), + k = ncol(Xr), mean_stim_dist = msd, + method = "morie_ideal_point_recovery" + ) } #' @keywords internal #' @rdname idlpt #' @export -ideal_point_recovery <- idlpt +morie_ideal_point_recovery <- idlpt #' @rdname idlpt #' @keywords internal #' @export -ideal_point_model <- idlpt +morie_ideal_point_model <- idlpt diff --git a/r-package/morie/R/impsm.R b/r-package/morie/R/impsm.R index baff659063..d43a338d68 100644 --- a/r-package/morie/R/impsm.R +++ b/r-package/morie/R/impsm.R @@ -9,23 +9,27 @@ #' @return list: estimate, estimate_sn, se, ess, n, method. #' @keywords internal impsm <- function(x, h = NULL, p = NULL, q = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 1L) + x <- as.numeric(x) + n <- length(x) + if (n < 1L) { return(list(estimate = NA_real_, n = 0L, method = "impsm (empty)")) + } if (is.null(h)) h <- function(z) z if (is.null(p)) p <- function(z) stats::dnorm(z) if (is.null(q)) q <- function(z) stats::dnorm(z) hx <- vapply(x, h, numeric(1)) px <- vapply(x, p, numeric(1)) qx <- vapply(x, q, numeric(1)) - w <- px / qx - est <- mean(w * hx) + w <- px / qx + est <- mean(w * hx) est_sn <- sum(w * hx) / sum(w) - se <- stats::sd(w * hx) / sqrt(n) - ess <- sum(w)^2 / sum(w^2) - list(estimate = as.numeric(est), estimate_sn = as.numeric(est_sn), - se = as.numeric(se), ess = as.numeric(ess), n = as.integer(n), - method = "Importance sampling (Geweke 1989)") + se <- stats::sd(w * hx) / sqrt(n) + ess <- sum(w)^2 / sum(w^2) + list( + estimate = as.numeric(est), estimate_sn = as.numeric(est_sn), + se = as.numeric(se), ess = as.numeric(ess), n = as.integer(n), + method = "Importance sampling (Geweke 1989)" + ) } # CANONICAL TEST @@ -37,4 +41,4 @@ impsm <- function(x, h = NULL, p = NULL, q = NULL) { #' @rdname impsm #' @keywords internal #' @export -importance_sampling <- impsm +morie_importance_sampling <- impsm diff --git a/r-package/morie/R/indkr.R b/r-package/morie/R/indkr.R index 4d63cc7b23..ff37dadfef 100644 --- a/r-package/morie/R/indkr.R +++ b/r-package/morie/R/indkr.R @@ -12,45 +12,56 @@ #' @return Named list: estimate, threshold, n, method. #' @references Journel (1983); Schabenberger & Gotway (2005), Ch 4. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' indkr(x = rnorm(50), coords = matrix(runif(100), 50, 2), threshold = 0.5) #' @export indkr <- function(x, coords, threshold, target = NULL, nugget = 0, sill = 0.25, range_ = 1) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } if (is.null(target)) target <- coords - target <- if (is.matrix(target)) target else + target <- if (is.matrix(target)) { + target + } else { matrix(as.numeric(unlist(target)), ncol = ncol(coords)) + } if (nrow(coords) != n) stop("coords rows must match length(x)") if (ncol(target) != ncol(coords)) stop("target/coords dim mismatch") I_obs <- as.numeric(x <= threshold) - c0 <- nugget; c1 <- sill - nugget; a <- range_ + c0 <- nugget + c1 <- sill - nugget + a <- range_ D <- as.matrix(stats::dist(coords)) cov_fn <- function(h) c1 * exp(-h / a) + ifelse(h == 0, c0, 0) C <- cov_fn(D) A <- matrix(0, n + 1, n + 1) - A[1:n, 1:n] <- C; A[1:n, n + 1] <- 1; A[n + 1, 1:n] <- 1 + A[1:n, 1:n] <- C + A[1:n, n + 1] <- 1 + A[n + 1, 1:n] <- 1 m <- nrow(target) probs <- numeric(m) for (k in seq_len(m)) { - d0 <- sqrt(colSums((t(coords) - target[k, ]) ^ 2)) + d0 <- sqrt(colSums((t(coords) - target[k, ])^2)) c_vec <- cov_fn(d0) rhs <- c(c_vec, 1) sol <- tryCatch(solve(A, rhs), - error = function(e) qr.solve(A, rhs)) + error = function(e) qr.solve(A, rhs) + ) lam <- sol[1:n] probs[k] <- min(max(sum(lam * I_obs), 0), 1) } - list(estimate = if (m == 1) probs[1] else probs, - threshold = threshold, n = n, - method = "Indicator kriging (ordinary, exp. cov)") + list( + estimate = if (m == 1) probs[1] else probs, + threshold = threshold, n = n, + method = "Indicator kriging (ordinary, exp. cov)" + ) } #' @rdname indkr #' @keywords internal #' @export -indicator_kriging <- indkr +morie_indicator_kriging <- indkr diff --git a/r-package/morie/R/inference.R b/r-package/morie/R/inference.R index 7760748619..f5ff50a78b 100644 --- a/r-package/morie/R/inference.R +++ b/r-package/morie/R/inference.R @@ -20,23 +20,25 @@ NULL #' @param x2 Numeric vector (group 2). #' @param equal_var Assume equal variances? Default `FALSE` (Welch test). #' @param alternative `"two.sided"`, `"greater"`, or `"less"`. -#' @return Named list: `t`, `df`, `p_value`, `ci_diff`, `cohens_d`. +#' @return Named list: `t`, `df`, `p_value`, `ci_diff`, `morie_cohens_d`. #' @export #' @examples -#' two_sample_t_test(rnorm(50, 0.5), rnorm(50, 0)) -two_sample_t_test <- function(x1, x2, - equal_var = FALSE, - alternative = c("two.sided", "greater", "less")) { +#' morie_two_sample_t_test(rnorm(50, 0.5), rnorm(50, 0)) +morie_two_sample_t_test <- function(x1, x2, + equal_var = FALSE, + alternative = c("two.sided", "greater", "less")) { alternative <- match.arg(alternative) - result <- stats::t.test(x1, x2, var.equal = equal_var, - alternative = alternative) - d <- cohens_d(x1, x2) + result <- stats::t.test(x1, x2, + var.equal = equal_var, + alternative = alternative + ) + d <- morie_cohens_d(x1, x2) list( t = as.numeric(result$statistic), df = as.numeric(result$parameter), p_value = result$p.value, ci_diff = as.numeric(result$conf.int), - cohens_d = d + morie_cohens_d = d ) } @@ -47,13 +49,10 @@ two_sample_t_test <- function(x1, x2, #' @param alternative `"two.sided"`, `"greater"`, or `"less"`. #' @return Named list: `t`, `df`, `p_value`, `ci`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_one_sample_t_test(x = rnorm(50)) #' @export -one_sample_t_test <- function(x, mu0 = 0, - alternative = c("two.sided", "greater", "less")) { +morie_one_sample_t_test <- function(x, mu0 = 0, + alternative = c("two.sided", "greater", "less")) { alternative <- match.arg(alternative) result <- stats::t.test(x, mu = mu0, alternative = alternative) list( @@ -71,13 +70,11 @@ one_sample_t_test <- function(x, mu0 = 0, #' @param alternative `"two.sided"`, `"greater"`, or `"less"`. #' @return Named list: `t`, `df`, `p_value`, `ci_diff`, `mean_diff`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -paired_t_test <- function(x1, x2, - alternative = c("two.sided", "greater", "less")) { +morie_paired_t_test <- function(x1, x2, + alternative = c("two.sided", "greater", "less")) { alternative <- match.arg(alternative) result <- stats::t.test(x1, x2, paired = TRUE, alternative = alternative) list( @@ -93,26 +90,28 @@ paired_t_test <- function(x1, x2, #' #' @param observed Observed counts (matrix for independence, vector for GOF). #' @param expected Expected counts for GOF (optional; uniform if NULL). -#' @return Named list: `chi_sq`, `df`, `p_value`, `cramers_v`. +#' @return Named list: `chi_sq`, `df`, `p_value`, `morie_cramers_v`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -chi_square_test <- function(observed, expected = NULL) { +morie_chi_square_test <- function(observed, expected = NULL) { if (is.matrix(observed) || is.data.frame(observed)) { result <- stats::chisq.test(observed) - v <- cramers_v(as.matrix(observed)) + v <- morie_cramers_v(as.matrix(observed)) } else { - result <- stats::chisq.test(observed, p = expected) + result <- if (is.null(expected)) { + stats::chisq.test(observed) + } else { + stats::chisq.test(observed, p = expected) + } v <- NA_real_ } list( chi_sq = as.numeric(result$statistic), df = as.numeric(result$parameter), p_value = result$p.value, - cramers_v = v + morie_cramers_v = v ) } @@ -122,13 +121,11 @@ chi_square_test <- function(observed, expected = NULL) { #' @param alternative `"two.sided"`, `"greater"`, or `"less"`. #' @return Named list: `odds_ratio`, `ci`, `p_value`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -fisher_exact_test <- function(table_2x2, - alternative = c("two.sided", "greater", "less")) { +morie_fisher_exact_test <- function(table_2x2, + alternative = c("two.sided", "greater", "less")) { alternative <- match.arg(alternative) result <- stats::fisher.test(as.matrix(table_2x2), alternative = alternative) list( @@ -142,11 +139,11 @@ fisher_exact_test <- function(table_2x2, #' #' @param ... Numeric vectors, one per group. #' @return Named list: `F`, `df_between`, `df_within`, `p_value`, -#' `eta_squared`. +#' `morie_eta_squared`. #' @export #' @examples -#' anova_one_way(rnorm(30, 0), rnorm(30, 0.5), rnorm(30, 1)) -anova_one_way <- function(...) { +#' morie_anova_one_way(rnorm(30, 0), rnorm(30, 0.5), rnorm(30, 1)) +morie_anova_one_way <- function(...) { groups <- list(...) if (length(groups) < 2) stop("At least two groups required.") df_long <- do.call(rbind, lapply(seq_along(groups), function(i) { @@ -155,16 +152,16 @@ anova_one_way <- function(...) { fit <- stats::aov(y ~ grp, data = df_long) s <- summary(fit)[[1]] f_val <- s["grp", "F value"] - df_b <- s["grp", "Df"] - df_w <- s["Residuals", "Df"] - ss_b <- s["grp", "Sum Sq"] - ss_t <- sum(s[, "Sum Sq"]) + df_b <- s["grp", "Df"] + df_w <- s["Residuals", "Df"] + ss_b <- s["grp", "Sum Sq"] + ss_t <- sum(s[, "Sum Sq"]) list( F = f_val, df_between = df_b, - df_within = df_w, - p_value = s["grp", "Pr(>F)"], - eta_squared = ss_b / ss_t + df_within = df_w, + p_value = s["grp", "Pr(>F)"], + morie_eta_squared = ss_b / ss_t ) } @@ -173,12 +170,10 @@ anova_one_way <- function(...) { #' @param ... Numeric vectors, one per group. #' @return Named list: `H`, `df`, `p_value`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -kruskal_wallis_test <- function(...) { +morie_kruskal_wallis_test <- function(...) { groups <- list(...) df_long <- do.call(rbind, lapply(seq_along(groups), function(i) { data.frame(y = groups[[i]], grp = factor(i)) @@ -198,16 +193,16 @@ kruskal_wallis_test <- function(...) { #' @param alternative `"two.sided"`, `"greater"`, or `"less"`. #' @return Named list: `W`, `p_value`, `r` (effect size). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -mann_whitney_test <- function(x1, x2, - alternative = c("two.sided", "greater", "less")) { +morie_mann_whitney_test <- function(x1, x2, + alternative = c("two.sided", "greater", "less")) { alternative <- match.arg(alternative) - result <- stats::wilcox.test(x1, x2, alternative = alternative, - exact = FALSE) + result <- stats::wilcox.test(x1, x2, + alternative = alternative, + exact = FALSE + ) n <- length(x1) * length(x2) r_effect <- abs(stats::qnorm(result$p.value / 2)) / sqrt(n) list(W = as.numeric(result$statistic), p_value = result$p.value, r = r_effect) @@ -220,16 +215,16 @@ mann_whitney_test <- function(x1, x2, #' @param alternative `"two.sided"`, `"greater"`, or `"less"`. #' @return Named list: `V`, `p_value`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -wilcoxon_signed_rank_test <- function(x1, x2, - alternative = c("two.sided", "greater", "less")) { +morie_wilcoxon_signed_rank_test <- function(x1, x2, + alternative = c("two.sided", "greater", "less")) { alternative <- match.arg(alternative) - result <- stats::wilcox.test(x1, x2, paired = TRUE, - alternative = alternative, exact = FALSE) + result <- stats::wilcox.test(x1, x2, + paired = TRUE, + alternative = alternative, exact = FALSE + ) list(V = as.numeric(result$statistic), p_value = result$p.value) } @@ -239,12 +234,9 @@ wilcoxon_signed_rank_test <- function(x1, x2, #' @param alpha Significance level for the `is_normal` flag (default 0.05). #' @return Named list: `W`, `p_value`, `is_normal`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_shapiro_wilk_test(x = rnorm(50)) #' @export -shapiro_wilk_test <- function(x, alpha = 0.05) { +morie_shapiro_wilk_test <- function(x, alpha = 0.05) { result <- stats::shapiro.test(x) list( W = as.numeric(result$statistic), @@ -258,12 +250,10 @@ shapiro_wilk_test <- function(x, alpha = 0.05) { #' @param ... Numeric vectors, one per group. #' @return Named list: `F`, `p_value`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -levene_test <- function(...) { +morie_levene_test <- function(...) { groups <- list(...) df_long <- do.call(rbind, lapply(seq_along(groups), function(i) { data.frame(y = groups[[i]], grp = factor(i)) @@ -290,9 +280,9 @@ levene_test <- function(...) { #' @return Named list: `p_hat`, `ci_lower`, `ci_upper`. #' @export #' @examples -#' proportion_ci(35, 100) -proportion_ci <- function(successes, n, alpha = 0.05, - method = c("wilson", "exact", "wald")) { +#' morie_proportion_ci(35, 100) +morie_proportion_ci <- function(successes, n, alpha = 0.05, + method = c("wilson", "exact", "wald")) { method <- match.arg(method) p <- successes / n z <- stats::qnorm(1 - alpha / 2) @@ -303,9 +293,11 @@ proportion_ci <- function(successes, n, alpha = 0.05, margin <- z * sqrt(p * (1 - p) / n + z^2 / (4 * n^2)) / denom ci <- c(centre - margin, centre + margin) } else if (method == "exact") { - ci <- stats::qbeta(c(alpha / 2, 1 - alpha / 2), - c(successes, successes + 1), - c(n - successes + 1, n - successes)) + ci <- stats::qbeta( + c(alpha / 2, 1 - alpha / 2), + c(successes, successes + 1), + c(n - successes + 1, n - successes) + ) } else { margin <- z * sqrt(p * (1 - p) / n) ci <- c(p - margin, p + margin) @@ -320,19 +312,17 @@ proportion_ci <- function(successes, n, alpha = 0.05, #' @param alpha Significance level. #' @return Named list: `or`, `ci_lower`, `ci_upper`, `p_value`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -odds_ratio_ci <- function(table_2x2, alpha = 0.05) { +morie_odds_ratio_ci <- function(table_2x2, alpha = 0.05) { m <- as.matrix(table_2x2) result <- stats::fisher.test(m) list( or = as.numeric(result$estimate), ci_lower = result$conf.int[1], ci_upper = result$conf.int[2], - p_value = result$p.value + p_value = result$p.value ) } @@ -342,18 +332,21 @@ odds_ratio_ci <- function(table_2x2, alpha = 0.05) { #' @param alpha Significance level. #' @return Named list: `rr`, `ci_lower`, `ci_upper`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -risk_ratio_ci <- function(table_2x2, alpha = 0.05) { +morie_risk_ratio_ci <- function(table_2x2, alpha = 0.05) { m <- as.matrix(table_2x2) - a <- m[1, 1]; b <- m[1, 2]; c <- m[2, 1]; d <- m[2, 2] - n1 <- a + b; n2 <- c + d - p1 <- a / n1; p2 <- c / n2 + a <- m[1, 1] + b <- m[1, 2] + c <- m[2, 1] + d <- m[2, 2] + n1 <- a + b + n2 <- c + d + p1 <- a / n1 + p2 <- c / n2 rr <- p1 / p2 - log_se <- sqrt(1/a - 1/n1 + 1/c - 1/n2) + log_se <- sqrt(1 / a - 1 / n1 + 1 / c - 1 / n2) z <- stats::qnorm(1 - alpha / 2) list( rr = rr, @@ -368,16 +361,19 @@ risk_ratio_ci <- function(table_2x2, alpha = 0.05) { #' @param alpha Significance level. #' @return Named list: `rd`, `ci_lower`, `ci_upper`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -risk_difference_ci <- function(table_2x2, alpha = 0.05) { +morie_risk_difference_ci <- function(table_2x2, alpha = 0.05) { m <- as.matrix(table_2x2) - a <- m[1, 1]; b <- m[1, 2]; c <- m[2, 1]; d <- m[2, 2] - n1 <- a + b; n2 <- c + d - p1 <- a / n1; p2 <- c / n2 + a <- m[1, 1] + b <- m[1, 2] + c <- m[2, 1] + d <- m[2, 2] + n1 <- a + b + n2 <- c + d + p1 <- a / n1 + p2 <- c / n2 rd <- p1 - p2 z <- stats::qnorm(1 - alpha / 2) se <- sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2) @@ -400,34 +396,36 @@ risk_difference_ci <- function(table_2x2, alpha = 0.05) { #' @param pooled Use pooled SD (default `TRUE`). If `FALSE`, uses `sd(x2)`. #' @return Numeric Cohen's d. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -cohens_d <- function(x1, x2, pooled = TRUE) { - m1 <- mean(x1, na.rm = TRUE); m2 <- mean(x2, na.rm = TRUE) - n1 <- sum(!is.na(x1)); n2 <- sum(!is.na(x2)) - s1 <- stats::sd(x1, na.rm = TRUE); s2 <- stats::sd(x2, na.rm = TRUE) +morie_cohens_d <- function(x1, x2, pooled = TRUE) { + m1 <- mean(x1, na.rm = TRUE) + m2 <- mean(x2, na.rm = TRUE) + n1 <- sum(!is.na(x1)) + n2 <- sum(!is.na(x2)) + s1 <- stats::sd(x1, na.rm = TRUE) + s2 <- stats::sd(x2, na.rm = TRUE) sd_denom <- if (pooled) { sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)) - } else s2 + } else { + s2 + } (m1 - m2) / sd_denom } #' Hedges' g (bias-corrected Cohen's d) #' -#' @inheritParams cohens_d +#' @inheritParams morie_cohens_d #' @return Numeric Hedges' g. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -hedges_g <- function(x1, x2) { - d <- cohens_d(x1, x2, pooled = TRUE) - n1 <- sum(!is.na(x1)); n2 <- sum(!is.na(x2)) +morie_hedges_g <- function(x1, x2) { + d <- morie_cohens_d(x1, x2, pooled = TRUE) + n1 <- sum(!is.na(x1)) + n2 <- sum(!is.na(x2)) df <- n1 + n2 - 2 correction <- 1 - 3 / (4 * df - 1) d * correction @@ -440,45 +438,41 @@ hedges_g <- function(x1, x2) { #' @param df_within Degrees of freedom (denominator). #' @return Numeric eta-squared. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -eta_squared <- function(f_stat, df_between, df_within) { +morie_eta_squared <- function(f_stat, df_between, df_within) { ss_between <- f_stat * df_between - ss_total <- ss_between + df_within + ss_total <- ss_between + df_within ss_between / ss_total } #' Omega-squared (less biased than eta-squared) #' -#' @inheritParams eta_squared +#' @inheritParams morie_eta_squared #' @param n Total sample size. #' @return Numeric omega-squared. #' @export #' @examples -#' omega_squared(f_stat = 5.2, df_between = 2, df_within = 87, n = 90) -omega_squared <- function(f_stat, df_between, df_within, n) { +#' morie_omega_squared(f_stat = 5.2, df_between = 2, df_within = 87, n = 90) +morie_omega_squared <- function(f_stat, df_between, df_within, n) { (df_between * (f_stat - 1)) / (df_between * (f_stat - 1) + n) } #' Cramer's V for categorical association #' #' @param contingency_table A numeric matrix of observed counts. -#' @return Numeric Cramer's V in [0, 1]. +#' @return Numeric Cramer's V in the interval \[0, 1\]. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -cramers_v <- function(contingency_table) { +morie_cramers_v <- function(contingency_table) { m <- as.matrix(contingency_table) result <- stats::chisq.test(m, correct = FALSE) chi2 <- as.numeric(result$statistic) - n <- sum(m) - k <- min(nrow(m), ncol(m)) + n <- sum(m) + k <- min(nrow(m), ncol(m)) sqrt(chi2 / (n * (k - 1))) } @@ -488,12 +482,9 @@ cramers_v <- function(contingency_table) { #' @param y Numeric vector. #' @return Named list: `rho`, `p_value`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_spearman_rho(x = rnorm(50), y = rnorm(50)) #' @export -spearman_rho <- function(x, y) { +morie_spearman_rho <- function(x, y) { result <- stats::cor.test(x, y, method = "spearman", exact = FALSE) list(rho = as.numeric(result$estimate), p_value = result$p.value) } @@ -504,12 +495,9 @@ spearman_rho <- function(x, y) { #' @param y Numeric vector. #' @return Named list: `tau`, `p_value`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_kendall_tau(x = rnorm(50), y = rnorm(50)) #' @export -kendall_tau <- function(x, y) { +morie_kendall_tau <- function(x, y) { result <- stats::cor.test(x, y, method = "kendall", exact = FALSE) list(tau = as.numeric(result$estimate), p_value = result$p.value) } @@ -520,12 +508,10 @@ kendall_tau <- function(x, y) { #' @param continuous_var Continuous numeric vector. #' @return Named list: `r`, `p_value`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -point_biserial_r <- function(binary_var, continuous_var) { +morie_point_biserial_r <- function(binary_var, continuous_var) { result <- stats::cor.test(binary_var, continuous_var) list(r = as.numeric(result$estimate), p_value = result$p.value) } @@ -550,11 +536,11 @@ point_biserial_r <- function(binary_var, continuous_var) { #' @return Result of `stats::power.t.test()`. #' @export #' @examples -#' power_t_test(n = NULL, delta = 0.5, power = 0.80) -power_t_test <- function(n = NULL, delta = NULL, sd = 1, - sig_level = 0.05, power = NULL, - alternative = c("two.sided", "one.sided"), - type = c("two.sample", "one.sample", "paired")) { +#' morie_power_t_test(n = NULL, delta = 0.5, power = 0.80) +morie_power_t_test <- function(n = NULL, delta = NULL, sd = 1, + sig_level = 0.05, power = NULL, + alternative = c("two.sided", "one.sided"), + type = c("two.sample", "one.sample", "paired")) { alternative <- match.arg(alternative) type <- match.arg(type) stats::power.t.test( @@ -577,10 +563,10 @@ power_t_test <- function(n = NULL, delta = NULL, sd = 1, #' @return Result of `stats::power.prop.test()`. #' @export #' @examples -#' power_prop_test(p1 = 0.30, p2 = 0.20, power = 0.80) -power_prop_test <- function(n = NULL, p1 = NULL, p2 = NULL, - sig_level = 0.05, power = NULL, - alternative = c("two.sided", "one.sided")) { +#' morie_power_prop_test(p1 = 0.30, p2 = 0.20, power = 0.80) +morie_power_prop_test <- function(n = NULL, p1 = NULL, p2 = NULL, + sig_level = 0.05, power = NULL, + alternative = c("two.sided", "one.sided")) { alternative <- match.arg(alternative) stats::power.prop.test( n = n, p1 = p1, p2 = p2, @@ -601,17 +587,15 @@ power_prop_test <- function(n = NULL, p1 = NULL, p2 = NULL, #' @param two_sided Logical. #' @return Integer sample size. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export #' @references #' Hsieh FY, Bloch DA, Larsen MD (1998). A simple method of sample size #' calculation for linear and logistic regression. #' *Statistics in Medicine*, 17(14):1623-1634. -sample_size_logistic <- function(p0, or, alpha = 0.05, power = 0.80, - two_sided = TRUE) { +morie_sample_size_logistic <- function(p0, or, alpha = 0.05, power = 0.80, + two_sided = TRUE) { p1 <- (or * p0) / (1 - p0 + or * p0) z_a <- stats::qnorm(if (two_sided) 1 - alpha / 2 else 1 - alpha) z_b <- stats::qnorm(power) diff --git a/r-package/morie/R/inspector.R b/r-package/morie/R/inspector.R index c8060d65cc..40e50f1ff9 100644 --- a/r-package/morie/R/inspector.R +++ b/r-package/morie/R/inspector.R @@ -15,10 +15,10 @@ #' tmp <- tempfile(fileext = ".json") #' if (requireNamespace("jsonlite", quietly = TRUE)) { #' jsonlite::write_json(list(estimate = 0.123, se = 0.045), tmp) -#' inspect_output(tmp) +#' morie_inspect_output(tmp) #' unlink(tmp) #' } -inspect_output <- function(path) { +morie_inspect_output <- function(path) { result <- list( path = path, format = tools::file_ext(path), @@ -30,31 +30,36 @@ inspect_output <- function(path) { return(result) } ext <- tolower(tools::file_ext(path)) - result$contents_preview <- tryCatch({ - if (ext == "json") { - if (!requireNamespace("jsonlite", quietly = TRUE)) { - result$status <- "jsonlite-unavailable" + result$contents_preview <- tryCatch( + { + if (ext == "json") { + if (!requireNamespace("jsonlite", quietly = TRUE)) { + result$status <- "jsonlite-unavailable" + return(result) + } + obj <- jsonlite::fromJSON(path) + if (is.list(obj)) names(obj) else utils::head(obj) + } else if (ext == "csv") { + df <- utils::read.csv(path, nrows = 5L) + result$n_columns <- ncol(df) + utils::head(df) + } else if (ext == "rds") { + obj <- readRDS(path) + result$class <- class(obj) + if (is.data.frame(obj)) utils::head(obj) else utils::head(names(obj)) + } else { + result$status <- paste0("unsupported-extension: ", ext) return(result) } - obj <- jsonlite::fromJSON(path) - if (is.list(obj)) names(obj) else utils::head(obj) - } else if (ext == "csv") { - df <- utils::read.csv(path, nrows = 5L) - result$n_columns <- ncol(df) - utils::head(df) - } else if (ext == "rds") { - obj <- readRDS(path) - result$class <- class(obj) - if (is.data.frame(obj)) utils::head(obj) else utils::head(names(obj)) - } else { - result$status <- paste0("unsupported-extension: ", ext) - return(result) + }, + error = function(e) { + # <<- so the read-error status reaches the enclosing `result`; + # a plain <- would assign only in this handler's environment. + result$status <<- paste0("read-error: ", conditionMessage(e)) + NULL } - }, error = function(e) { - result$status <- paste0("read-error: ", conditionMessage(e)) - NULL - }) - result$status <- "ok" + ) + if (is.null(result$status)) result$status <- "ok" result } @@ -79,20 +84,23 @@ inspect_output <- function(path) { #' if (requireNamespace("jsonlite", quietly = TRUE)) { #' jsonlite::write_json( #' list(ate = 0.5, se = 0.1, ci_lower = 0.3, ci_upper = 0.7, n = 200), -#' tmp, auto_unbox = TRUE +#' tmp, +#' auto_unbox = TRUE #' ) -#' verify_statistical_output(tmp) +#' morie_verify_statistical_output(tmp) #' unlink(tmp) #' } -verify_statistical_output <- function(path) { +morie_verify_statistical_output <- function(path) { out <- list(path = path, passed = FALSE, checks = list()) if (!file.exists(path)) { out$checks$file_exists <- FALSE return(out) } - if (!requireNamespace("jsonlite", quietly = TRUE)) - stop("jsonlite is required for verify_statistical_output().", - call. = FALSE) + if (!requireNamespace("jsonlite", quietly = TRUE)) { + stop("jsonlite is required for morie_verify_statistical_output().", + call. = FALSE + ) + } obj <- tryCatch(jsonlite::fromJSON(path), error = function(e) NULL) if (is.null(obj)) { @@ -103,25 +111,30 @@ verify_statistical_output <- function(path) { has <- function(k) !is.null(obj[[k]]) && length(obj[[k]]) >= 1L - if (has("se")) + if (has("se")) { out$checks$se_nonneg <- isTRUE(as.numeric(obj$se) >= 0) - if (has("ci_lower") && has("ci_upper")) + } + if (has("ci_lower") && has("ci_upper")) { out$checks$ci_ordered <- isTRUE(as.numeric(obj$ci_lower) < - as.numeric(obj$ci_upper)) - if (has("ate") && has("ci_lower") && has("ci_upper")) + as.numeric(obj$ci_upper)) + } + if (has("ate") && has("ci_lower") && has("ci_upper")) { out$checks$estimate_in_ci <- isTRUE(as.numeric(obj$ate) >= as.numeric(obj$ci_lower) && - as.numeric(obj$ate) <= as.numeric(obj$ci_upper)) - if (has("n")) + as.numeric(obj$ate) <= as.numeric(obj$ci_upper)) + } + if (has("n")) { out$checks$n_positive <- isTRUE(as.numeric(obj$n) > 0) + } if (has("p_value")) { p <- as.numeric(obj$p_value) out$checks$p_in_unit <- isTRUE(p >= 0 && p <= 1) } - if (has("ate")) + if (has("ate")) { out$checks$estimate_finite <- isTRUE(is.finite(as.numeric(obj$ate))) + } out$passed <- length(out$checks) > 0L && - all(unlist(out$checks)) + all(unlist(out$checks)) out } diff --git a/r-package/morie/R/investigation.R b/r-package/morie/R/investigation.R index 6f88d7b8ed..e9cbae1fd4 100644 --- a/r-package/morie/R/investigation.R +++ b/r-package/morie/R/investigation.R @@ -20,11 +20,12 @@ #' y = rbinom(200, 1, 0.4), #' x1 = rnorm(200), #' x2 = rnorm(200), -#' w = runif(200, 0.5, 1.5) +#' w = runif(200, 0.5, 1.5) #' ) -#' run_weighted_logistic_analysis(df, -#' outcome = "y", predictors = c("x1", "x2"), weights_col = "w") -run_weighted_logistic_analysis <- function(data, outcome, predictors, +#' morie_run_weighted_logistic_analysis(df, +#' outcome = "y", predictors = c("x1", "x2"), weights_col = "w" +#' ) +morie_run_weighted_logistic_analysis <- function(data, outcome, predictors, weights_col = NULL) { fml <- stats::as.formula( paste(outcome, "~", paste(predictors, collapse = " + ")) @@ -36,13 +37,17 @@ run_weighted_logistic_analysis <- function(data, outcome, predictors, design <- survey::svydesign( ids = ~1, data = data, weights = stats::as.formula(paste0("~", weights_col)) ) - fit <- survey::svyglm(fml, design = design, - family = stats::quasibinomial()) + fit <- survey::svyglm(fml, + design = design, + family = stats::quasibinomial() + ) method <- "svyglm" } else { w <- if (!is.null(weights_col)) data[[weights_col]] else NULL - fit <- stats::glm(fml, data = data, family = stats::binomial(), - weights = w) + fit <- stats::glm(fml, + data = data, family = stats::binomial(), + weights = w + ) method <- if (is.null(w)) "glm-unweighted" else "glm-weighted" } coef_summary <- stats::coef(summary(fit)) @@ -76,16 +81,19 @@ run_weighted_logistic_analysis <- function(data, outcome, predictors, #' y = rbinom(200, 1, 0.4), #' x1 = rnorm(200), x2 = rnorm(200), x3 = rnorm(200) #' ) -#' compare_nested_logistic_models(df, +#' morie_compare_nested_logistic_models(df, #' outcome = "y", -#' predictors_full = c("x1", "x2", "x3"), -#' predictors_reduced = c("x1")) -compare_nested_logistic_models <- function(data, outcome, +#' predictors_full = c("x1", "x2", "x3"), +#' predictors_reduced = c("x1") +#' ) +morie_compare_nested_logistic_models <- function(data, outcome, predictors_full, predictors_reduced) { - if (!all(predictors_reduced %in% predictors_full)) + if (!all(predictors_reduced %in% predictors_full)) { stop("predictors_reduced must be a subset of predictors_full.", - call. = FALSE) + call. = FALSE + ) + } fml_full <- stats::as.formula( paste(outcome, "~", paste(predictors_full, collapse = " + ")) @@ -94,14 +102,18 @@ compare_nested_logistic_models <- function(data, outcome, paste(outcome, "~", paste(predictors_reduced, collapse = " + ")) ) - fit_full <- stats::glm(fml_full, data = data, - family = stats::binomial()) - fit_red <- stats::glm(fml_red, data = data, - family = stats::binomial()) + fit_full <- stats::glm(fml_full, + data = data, + family = stats::binomial() + ) + fit_red <- stats::glm(fml_red, + data = data, + family = stats::binomial() + ) chi_sq <- as.numeric(stats::deviance(fit_red) - stats::deviance(fit_full)) - df <- length(predictors_full) - length(predictors_reduced) - p_val <- stats::pchisq(chi_sq, df = df, lower.tail = FALSE) + df <- length(predictors_full) - length(predictors_reduced) + p_val <- stats::pchisq(chi_sq, df = df, lower.tail = FALSE) list( chi_sq = chi_sq, @@ -116,7 +128,7 @@ compare_nested_logistic_models <- function(data, outcome, #' Run a treatment-effects analysis (point estimate, SE, 95% CI) #' #' Mirrors the Python `morie.run_treatment_effects_analysis()`. Convenience -#' wrapper around [estimate_ate()] that also produces a 95% confidence +#' wrapper around [morie_estimate_ate()] that also produces a 95% confidence #' interval (delta-method approximation). #' #' @param data A `data.frame`. @@ -133,12 +145,15 @@ compare_nested_logistic_models <- function(data, outcome, #' t = rbinom(200, 1, 0.5), #' x1 = rnorm(200), x2 = rnorm(200) #' ) -#' run_treatment_effects_analysis(df, -#' treatment = "t", outcome = "y", covariates = c("x1", "x2")) -run_treatment_effects_analysis <- function(data, treatment, outcome, +#' morie_run_treatment_effects_analysis(df, +#' treatment = "t", outcome = "y", covariates = c("x1", "x2") +#' ) +morie_run_treatment_effects_analysis <- function(data, treatment, outcome, covariates) { - ate_res <- estimate_ate(data, treatment = treatment, outcome = outcome, - covariates = covariates) + ate_res <- morie_estimate_ate(data, + treatment = treatment, outcome = outcome, + covariates = covariates + ) list( ate = ate_res$ate, se = ate_res$se, diff --git a/r-package/morie/R/ipw.R b/r-package/morie/R/ipw.R index 1c74d9137b..af89f6cccc 100644 --- a/r-package/morie/R/ipw.R +++ b/r-package/morie/R/ipw.R @@ -2,12 +2,9 @@ #' #' @return Named list describing the expected local CPADS contract. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_cpads_contract() #' @export -cpads_contract <- function() { +morie_cpads_contract <- function() { list( source_kind = "local_private_file", expected_wrangled_path = "data/cache/cpads_pumf_wrangled.rds", @@ -34,13 +31,11 @@ cpads_contract <- function() { #' @param strict If `TRUE`, stop when required variables are missing. #' @return Character vector of missing variable names. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -validate_cpads_data <- function(data, strict = TRUE) { - required <- cpads_contract()$required_variables +morie_validate_cpads_data <- function(data, strict = TRUE) { + required <- morie_cpads_contract()$required_variables missing <- setdiff(required, names(data)) if (isTRUE(strict) && length(missing) > 0) { stop("CPADS data is missing required variables: ", paste(missing, collapse = ", "), call. = FALSE) @@ -67,8 +62,27 @@ validate_cpads_data <- function(data, strict = TRUE) { #' @param outcome Binary outcome column. #' @param covariates Covariate names for the propensity model. #' @return Named list of output tables and the analysis data. +#' @examples +#' # Run on a synthetic CPADS-shaped frame (the CKAN-fetched PUMF works +#' # identically -- see morie_load_cpads_data() for the real frame): +#' set.seed(1) +#' n <- 200 +#' cpads <- data.frame( +#' weight = runif(n, 0.5, 2), +#' alcohol_past12m = rbinom(n, 1, 0.8), +#' heavy_drinking_30d = rbinom(n, 1, 0.3), +#' ebac_tot = abs(rnorm(n, 0.05, 0.03)), +#' ebac_legal = rbinom(n, 1, 0.7), +#' cannabis_any_use = rbinom(n, 1, 0.3), +#' age_group = sample(1:6, n, TRUE), +#' gender = sample(1:2, n, TRUE), +#' province_region = sample(1:5, n, TRUE), +#' mental_health = sample(1:5, n, TRUE), +#' physical_health = sample(1:5, n, TRUE) +#' ) +#' morie_run_propensity_ipw_analysis(cpads) #' @export -run_propensity_ipw_analysis <- function( +morie_run_propensity_ipw_analysis <- function( data, output_dir = NULL, trim = c(0.01, 0.99), @@ -76,7 +90,7 @@ run_propensity_ipw_analysis <- function( outcome = "heavy_drinking_30d", covariates = c("age_group", "gender", "province_region", "mental_health", "physical_health") ) { - validate_cpads_data(data, strict = TRUE) + morie_validate_cpads_data(data, strict = TRUE) needed <- unique(c(treatment, outcome, covariates, "weight")) frame <- stats::na.omit(data[, needed, drop = FALSE]) @@ -141,16 +155,37 @@ run_propensity_ipw_analysis <- function( #' @param treatment Treatment column name. #' @param covariates Covariate names used in the observation model. #' @return Named list of output tables and the observed-domain analysis frame. +#' @examples +#' # Run on a synthetic CPADS-shaped frame (the CKAN-fetched PUMF works +#' # identically -- see morie_load_cpads_data() for the real frame): +#' if (requireNamespace("survey", quietly = TRUE)) { +#' set.seed(1) +#' n <- 200 +#' cpads <- data.frame( +#' weight = runif(n, 0.5, 2), +#' alcohol_past12m = rbinom(n, 1, 0.8), +#' heavy_drinking_30d = rbinom(n, 1, 0.3), +#' ebac_tot = abs(rnorm(n, 0.05, 0.03)), +#' ebac_legal = rbinom(n, 1, 0.7), +#' cannabis_any_use = rbinom(n, 1, 0.3), +#' age_group = sample(1:6, n, TRUE), +#' gender = sample(1:2, n, TRUE), +#' province_region = sample(1:5, n, TRUE), +#' mental_health = sample(1:5, n, TRUE), +#' physical_health = sample(1:5, n, TRUE) +#' ) +#' morie_run_ebac_selection_ipw_analysis(cpads) +#' } #' @export -run_ebac_selection_ipw_analysis <- function( +morie_run_ebac_selection_ipw_analysis <- function( data, output_dir = NULL, treatment = "cannabis_any_use", covariates = c("age_group", "gender", "province_region", "mental_health", "physical_health") ) { - validate_cpads_data(data, strict = TRUE) + morie_validate_cpads_data(data, strict = TRUE) if (!requireNamespace("survey", quietly = TRUE)) { - stop("The `survey` package is required for run_ebac_selection_ipw_analysis().", call. = FALSE) + stop("The `survey` package is required for morie_run_ebac_selection_ipw_analysis().", call. = FALSE) } needed <- unique(c( @@ -176,7 +211,10 @@ run_ebac_selection_ipw_analysis <- function( observed$w_combined_trim <- observed$weight * observed$sw_trim diag_tbl <- data.frame( - metric = c("eligible_n", "observed_n", "observed_rate", "sw_min", "sw_q01", "sw_q99", "sw_max", "ess_survey_x_ipw_trim"), + metric = c( + "eligible_n", "observed_n", "observed_rate", + "sw_min", "sw_q01", "sw_q99", "sw_max", "ess_survey_x_ipw_trim" + ), value = c( nrow(target), nrow(observed), @@ -246,7 +284,11 @@ run_ebac_selection_ipw_analysis <- function( utils::write.csv(diag_tbl, file.path(output_dir, "ebac_final_ipw_diagnostics.csv"), row.names = FALSE) utils::write.csv(ebac_final_ipw_or, file.path(output_dir, "ebac_final_ipw_or.csv"), row.names = FALSE) utils::write.csv(ebac_final_ipw_linear, file.path(output_dir, "ebac_final_ipw_linear.csv"), row.names = FALSE) - utils::write.csv(ebac_final_ipw_comparison, file.path(output_dir, "ebac_final_ipw_comparison.csv"), row.names = FALSE) + utils::write.csv( + ebac_final_ipw_comparison, + file.path(output_dir, "ebac_final_ipw_comparison.csv"), + row.names = FALSE + ) } list( diff --git a/r-package/morie/R/ipw_weights.R b/r-package/morie/R/ipw_weights.R index 8342aaad9e..753b31b9f6 100644 --- a/r-package/morie/R/ipw_weights.R +++ b/r-package/morie/R/ipw_weights.R @@ -26,13 +26,13 @@ #' t = rbinom(100, 1, 0.4), #' ps = pmin(pmax(runif(100, 0.05, 0.95), 0.05), 0.95) #' ) -#' w <- calculate_ipw_weights(df, treatment = "t", ps_col = "ps") +#' w <- morie_calculate_ipw_weights(df, treatment = "t", ps_col = "ps") #' summary(w) -calculate_ipw_weights <- function(data, treatment, ps_col, +morie_calculate_ipw_weights <- function(data, treatment, ps_col, stabilized = FALSE, trim_quantiles = NULL) { ps <- pmin(pmax(data[[ps_col]], 0.01), 0.99) - t <- data[[treatment]] + t <- data[[treatment]] if (stabilized) { p_treated <- mean(t) weights <- ifelse(t == 1, p_treated / ps, (1 - p_treated) / (1 - ps)) @@ -40,8 +40,9 @@ calculate_ipw_weights <- function(data, treatment, ps_col, weights <- (t / ps) + ((1 - t) / (1 - ps)) } if (!is.null(trim_quantiles)) { - if (length(trim_quantiles) != 2L) + if (length(trim_quantiles) != 2L) { stop("trim_quantiles must be length 2 (lower, upper).") + } qs <- stats::quantile(weights, probs = trim_quantiles, na.rm = TRUE) weights <- pmin(pmax(weights, qs[[1L]]), qs[[2L]]) } diff --git a/r-package/morie/R/irm.R b/r-package/morie/R/irm.R index fa7379d85f..667fcd36e1 100644 --- a/r-package/morie/R/irm.R +++ b/r-package/morie/R/irm.R @@ -46,8 +46,8 @@ #' @examples #' \donttest{ #' if (requireNamespace("DoubleML", quietly = TRUE) && -#' requireNamespace("mlr3", quietly = TRUE) && -#' requireNamespace("mlr3learners", quietly = TRUE)) { +#' requireNamespace("mlr3", quietly = TRUE) && +#' requireNamespace("mlr3learners", quietly = TRUE)) { #' set.seed(1) #' n <- 200 #' X <- matrix(rnorm(n * 5), n, 5) @@ -55,16 +55,18 @@ #' T <- rbinom(n, 1, ps) #' Y <- 0.5 * T + X[, 1] + rnorm(n) #' df <- data.frame(Y = Y, T = T, X) -#' estimate_irm(df, treatment = "T", outcome = "Y", -#' covariates = paste0("X", 1:5)) +#' morie_estimate_irm(df, +#' treatment = "T", outcome = "Y", +#' covariates = paste0("X", 1:5) +#' ) #' } #' } -estimate_irm <- function(data, treatment, outcome, covariates, +morie_estimate_irm <- function(data, treatment, outcome, covariates, n_folds = 5, random_state = 42) { for (pkg in c("DoubleML", "mlr3", "mlr3learners")) { if (!requireNamespace(pkg, quietly = TRUE)) { stop(sprintf( - "Package %s is required for estimate_irm(). Install with: install.packages(%s)", + "Package %s is required for morie_estimate_irm(). Install with: install.packages(%s)", sQuote(pkg), sQuote(pkg) ), call. = FALSE) } @@ -74,8 +76,9 @@ estimate_irm <- function(data, treatment, outcome, covariates, frame <- stats::na.omit(data[, cols, drop = FALSE]) for (col in covariates) { - if (!is.numeric(frame[[col]])) + if (!is.numeric(frame[[col]])) { frame[[col]] <- as.numeric(as.factor(frame[[col]])) + } } set.seed(random_state) @@ -94,8 +97,8 @@ estimate_irm <- function(data, treatment, outcome, covariates, dml_irm$fit() ate <- as.numeric(dml_irm$coef)[[1L]] - se <- as.numeric(dml_irm$se)[[1L]] - z <- 1.959964 + se <- as.numeric(dml_irm$se)[[1L]] + z <- 1.959964 list( ate = ate, diff --git a/r-package/morie/R/irtsp.R b/r-package/morie/R/irtsp.R index 3ccb9677c8..f60708f4be 100644 --- a/r-package/morie/R/irtsp.R +++ b/r-package/morie/R/irtsp.R @@ -11,36 +11,47 @@ #' @return Named list with `x_hat`, `alpha`, `beta`, `loglik`, `n_iter`, #' `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' irtsp(x = rnorm(50)) #' @export irtsp <- function(x, n_iter = 60L, tol = 1e-6) { logistic <- function(z) 1 / (1 + exp(-pmin(pmax(z, -30), 30))) M <- if (is.matrix(x)) x else matrix(as.numeric(x), ncol = 1L) - n <- nrow(M); m <- ncol(M) - if (n < 2L || m < 1L) - return(list(x_hat = rep(NA_real_, n), alpha = rep(NA_real_, m), - beta = rep(NA_real_, m), loglik = NA_real_, - n_iter = 0L, method = "irt_spatial")) + n <- nrow(M) + m <- ncol(M) + if (n < 2L || m < 1L) { + return(list( + x_hat = rep(NA_real_, n), alpha = rep(NA_real_, m), + beta = rep(NA_real_, m), loglik = NA_real_, + n_iter = 0L, method = "morie_irt_spatial" + )) + } Mc <- M - matrix(colMeans(M, na.rm = TRUE), n, m, byrow = TRUE) Mc[is.na(Mc)] <- 0 sv <- tryCatch(svd(Mc, nu = 1L, nv = 0L), - error = function(e) NULL) - x_hat <- if (!is.null(sv)) sv$u[, 1] * sv$d[1] - else seq(-1, 1, length.out = n) + error = function(e) NULL + ) + x_hat <- if (!is.null(sv)) { + sv$u[, 1] * sv$d[1] + } else { + seq(-1, 1, length.out = n) + } x_hat <- (x_hat - mean(x_hat)) / (stats::sd(x_hat) + 1e-12) - alpha <- rep(1, m); beta <- rep(0, m) + alpha <- rep(1, m) + beta <- rep(0, m) prev_ll <- -Inf it <- 0L for (it in seq_len(n_iter)) { for (j in seq_len(m)) { - yj <- M[, j]; mask <- !is.na(yj) - xj <- x_hat[mask]; yjm <- yj[mask] - a <- alpha[j]; b <- beta[j] + yj <- M[, j] + mask <- !is.na(yj) + xj <- x_hat[mask] + yjm <- yj[mask] + a <- alpha[j] + b <- beta[j] for (k in 1:5) { - z <- a * (xj - b); p <- logistic(z); w <- p * (1 - p) + 1e-9 + z <- a * (xj - b) + p <- logistic(z) + w <- p * (1 - p) + 1e-9 ga <- sum((yjm - p) * (xj - b)) gb <- sum((yjm - p) * (-a)) Haa <- -sum(w * (xj - b)^2) @@ -49,19 +60,27 @@ irtsp <- function(x, n_iter = 60L, tol = 1e-6) { H <- matrix(c(Haa, Hab, Hab, Hbb), 2, 2) g <- c(ga, gb) step <- tryCatch(solve(H - 1e-6 * diag(2), g), - error = function(e) NULL) + error = function(e) NULL + ) if (is.null(step)) break - a <- a - step[1]; b <- b - step[2] + a <- a - step[1] + b <- b - step[2] if (max(abs(step)) < tol) break } - alpha[j] <- a; beta[j] <- b + alpha[j] <- a + beta[j] <- b } for (i in seq_len(n)) { - yi <- M[i, ]; mask <- !is.na(yi) - aj <- alpha[mask]; bj <- beta[mask]; yim <- yi[mask] + yi <- M[i, ] + mask <- !is.na(yi) + aj <- alpha[mask] + bj <- beta[mask] + yim <- yi[mask] xi <- x_hat[i] for (k in 1:5) { - z <- aj * (xi - bj); p <- logistic(z); w <- p * (1 - p) + 1e-9 + z <- aj * (xi - bj) + p <- logistic(z) + w <- p * (1 - p) + 1e-9 g <- sum(aj * (yim - p)) H <- -sum(w * aj^2) if (abs(H) < 1e-12) break @@ -77,16 +96,19 @@ irtsp <- function(x, n_iter = 60L, tol = 1e-6) { P <- logistic(Z) mask_full <- !is.na(M) ll <- sum(ifelse(mask_full, - M * log(P + 1e-12) + (1 - M) * log(1 - P + 1e-12), - 0)) + M * log(P + 1e-12) + (1 - M) * log(1 - P + 1e-12), + 0 + )) if (abs(ll - prev_ll) < tol * max(1, abs(prev_ll))) break prev_ll <- ll } - list(x_hat = x_hat, alpha = alpha, beta = beta, - loglik = ll, n_iter = it, method = "irt_spatial_2pl") + list( + x_hat = x_hat, alpha = alpha, beta = beta, + loglik = ll, n_iter = it, method = "irt_spatial_2pl" + ) } #' @keywords internal #' @rdname irtsp #' @export -irt_spatial <- irtsp +morie_irt_spatial <- irtsp diff --git a/r-package/morie/R/isotn.R b/r-package/morie/R/isotn.R index 57a1331544..5e23cd89fd 100644 --- a/r-package/morie/R/isotn.R +++ b/r-package/morie/R/isotn.R @@ -11,12 +11,17 @@ #' @return list: x_sorted, fitted, residuals, sse, r2, n, method. #' @keywords internal isotn <- function(x, y, weights = NULL, increasing = TRUE) { - x <- as.numeric(x); y <- as.numeric(y); n <- length(x) - if (n < 2L || length(y) != n) + x <- as.numeric(x) + y <- as.numeric(y) + n <- length(x) + if (n < 2L || length(y) != n) { return(list(estimate = NA_real_, n = n, method = "Isotonic (n<2)")) + } if (is.null(weights)) weights <- rep(1, n) ord <- order(x) - xs <- x[ord]; ys <- y[ord]; ws <- weights[ord] + xs <- x[ord] + ys <- y[ord] + ws <- weights[ord] if (increasing) { fit <- stats::isoreg(xs, ys) fitted <- fit$yf @@ -28,10 +33,12 @@ isotn <- function(x, y, weights = NULL, increasing = TRUE) { sse <- sum(ws * resid^2) sst <- sum(ws * (ys - stats::weighted.mean(ys, ws))^2) r2 <- if (sst > 0) 1 - sse / sst else NA_real_ - list(x_sorted = xs, fitted = fitted, residuals = resid, - sse = sse, r2 = as.numeric(r2), - estimate = mean(fitted), n = as.integer(n), - method = "Isotonic regression (Barlow et al. 1972, PAVA)") + list( + x_sorted = xs, fitted = fitted, residuals = resid, + sse = sse, r2 = as.numeric(r2), + estimate = mean(fitted), n = as.integer(n), + method = "Isotonic regression (Barlow et al. 1972, PAVA)" + ) } # CANONICAL TEST @@ -42,4 +49,4 @@ isotn <- function(x, y, weights = NULL, increasing = TRUE) { #' @rdname isotn #' @keywords internal #' @export -isotonic_regression <- isotn +morie_isotonic_regression <- isotn diff --git a/r-package/morie/R/jkest.R b/r-package/morie/R/jkest.R index c935aac24c..42f1aa697d 100644 --- a/r-package/morie/R/jkest.R +++ b/r-package/morie/R/jkest.R @@ -11,26 +11,31 @@ #' @references Efron & Tibshirani (1993), Ch. 11. #' @keywords internal jkest <- function(x, statistic = NULL) { - x <- as.numeric(x); n <- length(x) + x <- as.numeric(x) + n <- length(x) if (n < 2L) { - return(list(estimate = NA_real_, bias = NA_real_, se = NA_real_, - n = n, method = "Jackknife (n<2)")) + return(list( + estimate = NA_real_, bias = NA_real_, se = NA_real_, + n = n, method = "Jackknife (n<2)" + )) } if (is.null(statistic)) statistic <- mean T_hat <- statistic(x) T_loo <- vapply(seq_len(n), function(i) statistic(x[-i]), numeric(1)) T_bar <- mean(T_loo) - bias <- (n - 1) * (T_bar - T_hat) + bias <- (n - 1) * (T_bar - T_hat) T_jack <- n * T_hat - (n - 1) * T_bar var_jack <- (n - 1) / n * sum((T_loo - T_bar)^2) se <- sqrt(var_jack) - list(estimate = as.numeric(T_jack), - theta_hat = as.numeric(T_hat), - bias = as.numeric(bias), - var = as.numeric(var_jack), - se = as.numeric(se), - n = as.integer(n), - method = "Jackknife (Quenouille 1956)") + list( + estimate = as.numeric(T_jack), + theta_hat = as.numeric(T_hat), + bias = as.numeric(bias), + var = as.numeric(var_jack), + se = as.numeric(se), + n = as.integer(n), + method = "Jackknife (Quenouille 1956)" + ) } # CANONICAL TEST @@ -40,4 +45,4 @@ jkest <- function(x, statistic = NULL) { #' @rdname jkest #' @keywords internal #' @export -jackknife_estimator <- jkest +morie_jackknife_estimator <- jkest diff --git a/r-package/morie/R/johsn.R b/r-package/morie/R/johsn.R index 15d6cb8cb4..72b523747d 100644 --- a/r-package/morie/R/johsn.R +++ b/r-package/morie/R/johsn.R @@ -7,24 +7,29 @@ #' @return Named list with \code{eigenvalues, trace_stat, crit_values, #' rank, n, k, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -johansen_cointegration <- function(x, k_ar_diff = 1) { - Y <- as.matrix(x); if (nrow(Y) < ncol(Y)) Y <- t(Y) - Tt <- nrow(Y); k <- ncol(Y) +morie_johansen_cointegration <- function(x, k_ar_diff = 1) { + Y <- as.matrix(x) + if (nrow(Y) < ncol(Y)) Y <- t(Y) + Tt <- nrow(Y) + k <- ncol(Y) if (Tt < 20 || k < 2) stop("Need T>=20, k>=2.") + if (is.null(colnames(Y))) colnames(Y) <- paste0("y", seq_len(k)) if (requireNamespace("urca", quietly = TRUE)) { - jres <- urca::ca.jo(Y, type = "trace", ecdet = "none", - K = max(k_ar_diff + 1, 2)) - return(list(eigenvalues = jres@lambda, - trace_stat = jres@teststat, - crit_values = jres@cval, - rank = sum(jres@teststat > jres@cval[, "5pct"]), - n = Tt, k = k, - method = "Johansen trace test via urca::ca.jo")) + jres <- urca::ca.jo(Y, + type = "trace", ecdet = "none", + K = max(k_ar_diff + 1, 2) + ) + return(list( + eigenvalues = jres@lambda, + trace_stat = jres@teststat, + crit_values = jres@cval, + rank = sum(jres@teststat > jres@cval[, "5pct"]), + n = Tt, k = k, + method = "Johansen trace test via urca::ca.jo" + )) } dY <- diff(Y) rows <- nrow(dY) - k_ar_diff @@ -32,28 +37,36 @@ johansen_cointegration <- function(x, k_ar_diff = 1) { Z1 <- Y[(k_ar_diff + 1):(k_ar_diff + rows), , drop = FALSE] Z2 <- matrix(1, rows, 1) if (k_ar_diff > 0) { - for (i in seq_len(k_ar_diff)) + for (i in seq_len(k_ar_diff)) { Z2 <- cbind(Z2, dY[(k_ar_diff - i + 1):(k_ar_diff - i + rows), , - drop = FALSE]) + drop = FALSE + ]) + } } P <- Z2 %*% solve(crossprod(Z2)) %*% t(Z2) - R0 <- Z0 - P %*% Z0; R1 <- Z1 - P %*% Z1 - S00 <- crossprod(R0) / rows; S01 <- crossprod(R0, R1) / rows + R0 <- Z0 - P %*% Z0 + R1 <- Z1 - P %*% Z1 + S00 <- crossprod(R0) / rows + S01 <- crossprod(R0, R1) / rows S11 <- crossprod(R1) / rows M <- solve(S11) %*% t(S01) %*% solve(S00) %*% S01 eig <- sort(Re(eigen(M, only.values = TRUE)$values), decreasing = TRUE) eig <- pmax(pmin(eig, 1 - 1e-12), 0) - trace_stat <- sapply(0:(k - 1), function(r) -rows * sum(log(1 - eig[(r + 1):k]))) + trace_stat <- vapply(0:(k - 1), function(r) -rows * sum(log(1 - eig[(r + 1):k])), numeric(1)) crit_table <- list( `1` = c(2.7055, 3.8415, 6.6349), `2` = c(13.4294, 15.4943, 19.9349), `3` = c(27.0669, 29.7961, 35.4628), `4` = c(44.4929, 47.8545, 54.6815), `5` = c(65.8202, 69.8189, 77.8202) ) - crit_values <- do.call(rbind, lapply(seq_len(k), - function(r) crit_table[[as.character(k - r + 1)]] %||% c(NA, NA, NA))) + crit_values <- do.call(rbind, lapply( + seq_len(k), + function(r) crit_table[[as.character(k - r + 1)]] %||% c(NA, NA, NA) + )) rank <- sum(trace_stat > crit_values[, 2]) - list(eigenvalues = eig, trace_stat = trace_stat, - crit_values = crit_values, rank = rank, - n = Tt, k = k, - method = "Johansen trace test (reduced-rank regression, base R)") + list( + eigenvalues = eig, trace_stat = trace_stat, + crit_values = crit_values, rank = rank, + n = Tt, k = k, + method = "Johansen trace test (reduced-rank regression, base R)" + ) } diff --git a/r-package/morie/R/kalmn.R b/r-package/morie/R/kalmn.R index 7c14682077..132f5ccb07 100644 --- a/r-package/morie/R/kalmn.R +++ b/r-package/morie/R/kalmn.R @@ -5,7 +5,7 @@ #' Defaults to a univariate local-level model when matrices are omitted. #' #' @param x Numeric vector or matrix of observations. -#' @param F Transition matrix (default identity). +#' @param transition Transition matrix (default identity). #' @param H Observation matrix (default identity). #' @param Q State-innovation covariance (default sigma^2 I). #' @param R Observation covariance (default sigma^2 I). @@ -14,37 +14,47 @@ #' @return Named list with \code{state, state_cov, innovations, #' innovation_variance, loglik, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_kalman_filter(x = rnorm(50)) #' @export -kalman_filter <- function(x, F = NULL, H = NULL, Q = NULL, R = NULL, - x0 = NULL, P0 = NULL) { - Y <- as.matrix(x); n <- nrow(Y); m <- ncol(Y) +morie_kalman_filter <- function(x, transition = NULL, H = NULL, Q = NULL, R = NULL, + x0 = NULL, P0 = NULL) { + Y <- as.matrix(x) + n <- nrow(Y) + m <- ncol(Y) if (n < 2) stop("Need >=2 obs.") - if (is.null(F)) F <- diag(m) + if (is.null(transition)) transition <- diag(m) if (is.null(H)) H <- diag(m) v0 <- var(diff(Y)) * 0.5 if (is.null(Q)) Q <- if (is.matrix(v0)) v0 else diag(as.numeric(v0), m) if (is.null(R)) R <- if (is.matrix(v0)) v0 else diag(as.numeric(v0), m) - F <- as.matrix(F); H <- as.matrix(H); Q <- as.matrix(Q); R <- as.matrix(R) - p <- nrow(F) - if (is.null(x0)) { x0 <- numeric(p); x0[seq_len(min(p, m))] <- Y[1, seq_len(min(p, m))] } + transition <- as.matrix(transition) + H <- as.matrix(H) + Q <- as.matrix(Q) + R <- as.matrix(R) + p <- nrow(transition) + if (is.null(x0)) { + x0 <- numeric(p) + x0[seq_len(min(p, m))] <- Y[1, seq_len(min(p, m))] + } if (is.null(P0)) P0 <- diag(1e6, p) x_hat <- matrix(0, n, p) P_arr <- array(0, c(n, p, p)) innov <- matrix(0, n, m) Sv <- array(0, c(n, m, m)) - xc <- as.numeric(x0); Pc <- P0; ll <- 0 + xc <- as.numeric(x0) + Pc <- P0 + ll <- 0 for (t in seq_len(n)) { - xp <- F %*% xc - Pp <- F %*% Pc %*% t(F) + Q + xp <- transition %*% xc + Pp <- transition %*% Pc %*% t(transition) + Q v <- Y[t, ] - H %*% xp S <- H %*% Pp %*% t(H) + R Sinv <- tryCatch(solve(S), error = function(e) { - if (requireNamespace("MASS", quietly = TRUE)) MASS::ginv(S) - else solve(S + diag(1e-8, nrow(S))) + if (requireNamespace("MASS", quietly = TRUE)) { + MASS::ginv(S) + } else { + solve(S + diag(1e-8, nrow(S))) + } }) K <- Pp %*% t(H) %*% Sinv xc <- as.numeric(xp + K %*% v) @@ -54,12 +64,15 @@ kalman_filter <- function(x, F = NULL, H = NULL, Q = NULL, R = NULL, innov[t, ] <- as.numeric(v) Sv[t, , ] <- S ld <- determinant(S, logarithm = TRUE) - if (ld$sign > 0) + if (ld$sign > 0) { ll <- ll + -0.5 * (m * log(2 * pi) + ld$modulus + - sum(v * (Sinv %*% v))) + sum(v * (Sinv %*% v))) + } } - list(state = x_hat, state_cov = P_arr, - innovations = innov, innovation_variance = Sv, - loglik = as.numeric(ll), n = n, - method = "Linear Gaussian Kalman filter (base R)") + list( + state = x_hat, state_cov = P_arr, + innovations = innov, innovation_variance = Sv, + loglik = as.numeric(ll), n = n, + method = "Linear Gaussian Kalman filter (base R)" + ) } diff --git a/r-package/morie/R/kmnsc.R b/r-package/morie/R/kmnsc.R index 172a88a14d..8ebe71f871 100644 --- a/r-package/morie/R/kmnsc.R +++ b/r-package/morie/R/kmnsc.R @@ -12,21 +12,20 @@ #' @return Named list: estimate (inertia), labels, centers, inertia, #' n_iter, n_clusters, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_kmeans_clustering(x = rnorm(50)) #' @export -kmeans_clustering <- function(x, n_clusters = 3L, n_init = 10L, - max_iter = 300L, seed = 0L) { +morie_kmeans_clustering <- function(x, n_clusters = 3L, n_init = 10L, + max_iter = 300L, seed = 0L) { if (is.null(dim(x))) x <- matrix(x, ncol = 1) x <- as.matrix(x) set.seed(seed) - fit <- stats::kmeans(x, centers = n_clusters, iter.max = max_iter, - nstart = n_init, algorithm = "Hartigan-Wong") + fit <- stats::kmeans(x, + centers = n_clusters, iter.max = max_iter, + nstart = n_init, algorithm = "Hartigan-Wong" + ) list( estimate = as.numeric(fit$tot.withinss), - labels = as.integer(fit$cluster - 1L), # 0-indexed for Py parity + labels = as.integer(fit$cluster - 1L), # 0-indexed for Py parity centers = fit$centers, inertia = as.numeric(fit$tot.withinss), n_iter = as.integer(fit$iter), diff --git a/r-package/morie/R/ksr01.R b/r-package/morie/R/ksr01.R index df1dc6f493..a4ac01dfc2 100644 --- a/r-package/morie/R/ksr01.R +++ b/r-package/morie/R/ksr01.R @@ -12,14 +12,11 @@ #' @return Named list: estimate, se, n, method. #' @references Kosorok (2008), Ch 2. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr01_kosorok_empirical_process(x = rnorm(50)) #' @export -ksr01_kosorok_empirical_process <- function(x, f = NULL, mu0 = 0) { - x <- as.numeric(x) - n <- length(x) +morie_ksr01_kosorok_empirical_process <- function(x, f = NULL, mu0 = 0) { + x <- as.numeric(x) + n <- length(x) fx <- if (is.null(f)) x else vapply(x, f, numeric(1)) pn <- mean(fx) estimate <- sqrt(n) * (pn - mu0) @@ -33,9 +30,9 @@ ksr01_kosorok_empirical_process <- function(x, f = NULL, mu0 = 0) { } # CANONICAL TEST -# set.seed(0); xs <- rnorm(200); r <- ksr01_kosorok_empirical_process(xs); r +# set.seed(0); xs <- rnorm(200); r <- morie_ksr01_kosorok_empirical_process(xs); r -#' @rdname ksr01_kosorok_empirical_process +#' @rdname morie_ksr01_kosorok_empirical_process #' @keywords internal #' @export -kosorok_empirical_process <- ksr01_kosorok_empirical_process +morie_kosorok_empirical_process <- morie_ksr01_kosorok_empirical_process diff --git a/r-package/morie/R/ksr02.R b/r-package/morie/R/ksr02.R index c9c1d98bdd..caedef38a6 100644 --- a/r-package/morie/R/ksr02.R +++ b/r-package/morie/R/ksr02.R @@ -3,22 +3,22 @@ #' Donsker-class verification via bracketing integral #' #' Computes J_[](1, F, L_2(P)) = int_0^1 sqrt(log N_brackets(e, F, L_2(P))) de -#' for the indicator class F of one-sided thresholds on X (Kosorok Ex 2.5.4), with bracketing number bounded by 2 over epsilon squared. +#' for the indicator class F of one-sided thresholds on X +#' (Kosorok Ex 2.5.4), with bracketing number bounded by 2 / epsilon^2. #' #' @param x Numeric vector (unused, kept for API parity). #' @return Named list with estimate, n, method. #' @references Kosorok (2008), Ch 2 (Theorem 2.5.2). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr02_kosorok_donsker_class(x = rnorm(50)) #' @export -ksr02_kosorok_donsker_class <- function(x) { +morie_ksr02_kosorok_donsker_class <- function(x) { x <- as.numeric(x) integrand <- function(e) sqrt(log(2) - 2 * log(e)) - j <- stats::integrate(integrand, lower = 1e-8, upper = 1.0, - subdivisions = 200L)$value + j <- stats::integrate(integrand, + lower = 1e-8, upper = 1.0, + subdivisions = 200L + )$value list( estimate = j, n = length(x), @@ -27,9 +27,9 @@ ksr02_kosorok_donsker_class <- function(x) { } # CANONICAL TEST -# ksr02_kosorok_donsker_class(1:10) +# morie_ksr02_kosorok_donsker_class(1:10) -#' @rdname ksr02_kosorok_donsker_class +#' @rdname morie_ksr02_kosorok_donsker_class #' @keywords internal #' @export -kosorok_donsker_class <- ksr02_kosorok_donsker_class +morie_kosorok_donsker_class <- morie_ksr02_kosorok_donsker_class diff --git a/r-package/morie/R/ksr03.R b/r-package/morie/R/ksr03.R index 9ec7ce190a..b00b03cb50 100644 --- a/r-package/morie/R/ksr03.R +++ b/r-package/morie/R/ksr03.R @@ -10,12 +10,9 @@ #' @return Named list with statistic, p_value, n, method. #' @references Kosorok (2008), Ch 2. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr03_kosorok_glivenko_cantelli(x = rnorm(50)) #' @export -ksr03_kosorok_glivenko_cantelli <- function(x, cdf = "pnorm") { +morie_ksr03_kosorok_glivenko_cantelli <- function(x, cdf = "pnorm") { x <- as.numeric(x) n <- length(x) res <- suppressWarnings(stats::ks.test(x, cdf)) @@ -28,9 +25,9 @@ ksr03_kosorok_glivenko_cantelli <- function(x, cdf = "pnorm") { } # CANONICAL TEST -# set.seed(0); ksr03_kosorok_glivenko_cantelli(rnorm(200)) +# set.seed(0); morie_ksr03_kosorok_glivenko_cantelli(rnorm(200)) -#' @rdname ksr03_kosorok_glivenko_cantelli +#' @rdname morie_ksr03_kosorok_glivenko_cantelli #' @keywords internal #' @export -kosorok_glivenko_cantelli <- ksr03_kosorok_glivenko_cantelli +morie_kosorok_glivenko_cantelli <- morie_ksr03_kosorok_glivenko_cantelli diff --git a/r-package/morie/R/ksr04.R b/r-package/morie/R/ksr04.R index ec4c6eae02..cb7ff5d0a4 100644 --- a/r-package/morie/R/ksr04.R +++ b/r-package/morie/R/ksr04.R @@ -8,16 +8,15 @@ #' @return Named list with estimate, n, method. #' @references Kosorok (2008), Ch 2; Vapnik & Chervonenkis (1971). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr04_kosorok_vc_dimension(x = rnorm(50)) #' @export -ksr04_kosorok_vc_dimension <- function(x) { +morie_ksr04_kosorok_vc_dimension <- function(x) { if (is.null(dim(x))) { - d <- 1L; n <- length(x) + d <- 1L + n <- length(x) } else { - n <- nrow(x); d <- ncol(x) + n <- nrow(x) + d <- ncol(x) } list( estimate = as.integer(d + 1L), @@ -27,9 +26,9 @@ ksr04_kosorok_vc_dimension <- function(x) { } # CANONICAL TEST -# ksr04_kosorok_vc_dimension(matrix(0, 100, 3)) +# morie_ksr04_kosorok_vc_dimension(matrix(0, 100, 3)) -#' @rdname ksr04_kosorok_vc_dimension +#' @rdname morie_ksr04_kosorok_vc_dimension #' @keywords internal #' @export -kosorok_vc_dimension <- ksr04_kosorok_vc_dimension +morie_kosorok_vc_dimension <- morie_ksr04_kosorok_vc_dimension diff --git a/r-package/morie/R/ksr05.R b/r-package/morie/R/ksr05.R index 5953be4c2e..8720ce0217 100644 --- a/r-package/morie/R/ksr05.R +++ b/r-package/morie/R/ksr05.R @@ -9,12 +9,9 @@ #' @return Named list with estimate, n, method. #' @references Kosorok (2008), Ch 2. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr05_kosorok_bracketing_number(x = rnorm(50)) #' @export -ksr05_kosorok_bracketing_number <- function(x, e = 0.1) { +morie_ksr05_kosorok_bracketing_number <- function(x, e = 0.1) { x <- as.numeric(x) list( estimate = as.integer(ceiling(1 / e^2)), @@ -24,9 +21,9 @@ ksr05_kosorok_bracketing_number <- function(x, e = 0.1) { } # CANONICAL TEST -# ksr05_kosorok_bracketing_number(1:50, 0.1) +# morie_ksr05_kosorok_bracketing_number(1:50, 0.1) -#' @rdname ksr05_kosorok_bracketing_number +#' @rdname morie_ksr05_kosorok_bracketing_number #' @keywords internal #' @export -kosorok_bracketing_number <- ksr05_kosorok_bracketing_number +morie_kosorok_bracketing_number <- morie_ksr05_kosorok_bracketing_number diff --git a/r-package/morie/R/ksr06.R b/r-package/morie/R/ksr06.R index fab9ca0b0c..1aee5401f6 100644 --- a/r-package/morie/R/ksr06.R +++ b/r-package/morie/R/ksr06.R @@ -9,19 +9,18 @@ #' @return Named list with estimate, n, method. #' @references Kosorok (2008), Ch 2. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr06_kosorok_maximal_inequality(x = rnorm(50)) #' @export -ksr06_kosorok_maximal_inequality <- function(x) { +morie_ksr06_kosorok_maximal_inequality <- function(x) { x <- as.numeric(x) n <- length(x) sigma_n <- if (n > 1L) stats::sd(x) else NA_real_ theta_n <- 0.5 integrand <- function(e) sqrt(log(2) - 2 * log(e)) - j <- stats::integrate(integrand, lower = 1e-8, upper = theta_n, - subdivisions = 200L)$value + j <- stats::integrate(integrand, + lower = 1e-8, upper = theta_n, + subdivisions = 200L + )$value list( estimate = j * sigma_n, n = n, @@ -30,9 +29,9 @@ ksr06_kosorok_maximal_inequality <- function(x) { } # CANONICAL TEST -# set.seed(0); ksr06_kosorok_maximal_inequality(rnorm(200)) +# set.seed(0); morie_ksr06_kosorok_maximal_inequality(rnorm(200)) -#' @rdname ksr06_kosorok_maximal_inequality +#' @rdname morie_ksr06_kosorok_maximal_inequality #' @keywords internal #' @export -kosorok_maximal_inequality <- ksr06_kosorok_maximal_inequality +morie_kosorok_maximal_inequality <- morie_ksr06_kosorok_maximal_inequality diff --git a/r-package/morie/R/ksr07.R b/r-package/morie/R/ksr07.R index 3f9f2326b3..f9d9958a81 100644 --- a/r-package/morie/R/ksr07.R +++ b/r-package/morie/R/ksr07.R @@ -15,12 +15,9 @@ #' @return Named list with estimate, se, n, method. #' @references Kosorok (2008), Ch 10. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr07_kosorok_bootstrap_empirical(x = rnorm(50)) #' @export -ksr07_kosorok_bootstrap_empirical <- function(x, B = 1000, seed = 0, +morie_ksr07_kosorok_bootstrap_empirical <- function(x, B = 1000, seed = 0, deterministic_seed = NULL) { x <- as.numeric(x) n <- length(x) @@ -41,9 +38,9 @@ ksr07_kosorok_bootstrap_empirical <- function(x, B = 1000, seed = 0, } # CANONICAL TEST -# set.seed(0); ksr07_kosorok_bootstrap_empirical(rnorm(200), B=500, seed=42) +# set.seed(0); morie_ksr07_kosorok_bootstrap_empirical(rnorm(200), B=500, seed=42) -#' @rdname ksr07_kosorok_bootstrap_empirical +#' @rdname morie_ksr07_kosorok_bootstrap_empirical #' @keywords internal #' @export -kosorok_bootstrap_empirical <- ksr07_kosorok_bootstrap_empirical +morie_kosorok_bootstrap_empirical <- morie_ksr07_kosorok_bootstrap_empirical diff --git a/r-package/morie/R/ksr08.R b/r-package/morie/R/ksr08.R index e2dda96236..b8297bbcf1 100644 --- a/r-package/morie/R/ksr08.R +++ b/r-package/morie/R/ksr08.R @@ -14,12 +14,9 @@ #' @return Named list with estimate, se, n, method. #' @references Kosorok (2008), Ch 10. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr08_kosorok_multiplier_bootstrap(x = rnorm(50)) #' @export -ksr08_kosorok_multiplier_bootstrap <- function(x, B = 1000, seed = 0, +morie_ksr08_kosorok_multiplier_bootstrap <- function(x, B = 1000, seed = 0, deterministic_seed = NULL) { x <- as.numeric(x) n <- length(x) @@ -41,9 +38,9 @@ ksr08_kosorok_multiplier_bootstrap <- function(x, B = 1000, seed = 0, } # CANONICAL TEST -# set.seed(0); ksr08_kosorok_multiplier_bootstrap(rnorm(200), B=500, seed=42) +# set.seed(0); morie_ksr08_kosorok_multiplier_bootstrap(rnorm(200), B=500, seed=42) -#' @rdname ksr08_kosorok_multiplier_bootstrap +#' @rdname morie_ksr08_kosorok_multiplier_bootstrap #' @keywords internal #' @export -kosorok_multiplier_bootstrap <- ksr08_kosorok_multiplier_bootstrap +morie_kosorok_multiplier_bootstrap <- morie_ksr08_kosorok_multiplier_bootstrap diff --git a/r-package/morie/R/ksr09.R b/r-package/morie/R/ksr09.R index 4daac9a419..cc66c0e2ef 100644 --- a/r-package/morie/R/ksr09.R +++ b/r-package/morie/R/ksr09.R @@ -10,39 +10,41 @@ #' @return Named list with estimate, se, n, method. #' @references Kosorok (2008), Ch 5. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr09_kosorok_z_estimator(x = rnorm(50)) #' @export -ksr09_kosorok_z_estimator <- function(x, y = NULL) { +morie_ksr09_kosorok_z_estimator <- function(x, y = NULL) { x <- as.numeric(x) if (is.null(y)) { n <- length(x) theta <- mean(x) psi <- x - theta - se <- sqrt(mean(psi^2) / n) - list(estimate = theta, se = se, n = n, - method = "Z-estimator: psi(x;theta) = x - theta") + se <- sqrt(mean(psi^2) / n) + list( + estimate = theta, se = se, n = n, + method = "Z-estimator: psi(x;theta) = x - theta" + ) } else { y <- as.numeric(y) n <- length(x) - xc <- x - mean(x); yc <- y - mean(y) + xc <- x - mean(x) + yc <- y - mean(y) beta <- sum(xc * yc) / sum(xc^2) resid <- yc - beta * xc A <- mean(xc^2) B <- mean((xc^2) * (resid^2)) se <- sqrt(B / (A^2) / n) - list(estimate = beta, se = se, n = n, - method = "Z-estimator: psi(x,y;beta) = x(y - beta x)") + list( + estimate = beta, se = se, n = n, + method = "Z-estimator: psi(x,y;beta) = x(y - beta x)" + ) } } # CANONICAL TEST # set.seed(0); xs <- rnorm(200); ys <- 1.5*xs + rnorm(200) -# ksr09_kosorok_z_estimator(xs, ys) +# morie_ksr09_kosorok_z_estimator(xs, ys) -#' @rdname ksr09_kosorok_z_estimator +#' @rdname morie_ksr09_kosorok_z_estimator #' @keywords internal #' @export -kosorok_z_estimator <- ksr09_kosorok_z_estimator +morie_kosorok_z_estimator <- morie_ksr09_kosorok_z_estimator diff --git a/r-package/morie/R/ksr10.R b/r-package/morie/R/ksr10.R index 201c665fd6..3ab2fdc93b 100644 --- a/r-package/morie/R/ksr10.R +++ b/r-package/morie/R/ksr10.R @@ -12,13 +12,10 @@ #' @return Named list with estimate, se, n, method. #' @references Kosorok (2008), Ch 5; Huber (1981). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr10_kosorok_m_estimator(x = rnorm(50)) #' @export -ksr10_kosorok_m_estimator <- function(x, y = NULL, k = 1.345, - max_iter = 100, tol = 1e-10) { +morie_ksr10_kosorok_m_estimator <- function(x, y = NULL, k = 1.345, + max_iter = 100, tol = 1e-10) { x <- as.numeric(x) n <- length(x) eta <- stats::mad(x, constant = 1) / 0.6745 @@ -26,29 +23,32 @@ ksr10_kosorok_m_estimator <- function(x, y = NULL, k = 1.345, if (eta == 0) eta <- 1 theta <- stats::median(x) for (it in seq_len(max_iter)) { - u <- (x - theta) / eta + u <- (x - theta) / eta psi <- pmin(pmax(u, -k), k) denom <- mean(abs(u) <= k) if (denom == 0) break step <- eta * mean(psi) / denom - if (abs(step) < tol) { theta <- theta + step; break } + if (abs(step) < tol) { + theta <- theta + step + break + } theta <- theta + step } - u <- (x - theta) / eta + u <- (x - theta) / eta psi <- pmin(pmax(u, -k), k) A <- mean(abs(u) <= k) / eta B <- mean(psi^2) se <- sqrt(B / (A^2) / n) list( estimate = theta, se = se, n = n, - method = sprintf("Huber-M location (k=%.3f) with profiled MAD/0.6745 scale", k) + method = sprintf("Huber-M location (k=%.3f) with profiled MAD/0.6745 scale", k) ) } # CANONICAL TEST -# set.seed(0); ksr10_kosorok_m_estimator(rnorm(200)) +# set.seed(0); morie_ksr10_kosorok_m_estimator(rnorm(200)) -#' @rdname ksr10_kosorok_m_estimator +#' @rdname morie_ksr10_kosorok_m_estimator #' @keywords internal #' @export -kosorok_m_estimator <- ksr10_kosorok_m_estimator +morie_kosorok_m_estimator <- morie_ksr10_kosorok_m_estimator diff --git a/r-package/morie/R/ksr11.R b/r-package/morie/R/ksr11.R index 7a2d842224..f077cbfe9d 100644 --- a/r-package/morie/R/ksr11.R +++ b/r-package/morie/R/ksr11.R @@ -10,15 +10,14 @@ #' se (residual sd), n, method. #' @references Kosorok (2008), Ch 6. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr11_kosorok_efficient_score(x = rnorm(50), y = rnorm(50)) #' @export -ksr11_kosorok_efficient_score <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) +morie_ksr11_kosorok_efficient_score <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) n <- length(x) - xc <- x - mean(x); yc <- y - mean(y) + xc <- x - mean(x) + yc <- y - mean(y) beta <- sum(xc * yc) / sum(xc^2) resid <- yc - beta * xc sigma2 <- sum(resid^2) / (n - 2) @@ -33,9 +32,9 @@ ksr11_kosorok_efficient_score <- function(x, y) { # CANONICAL TEST # set.seed(0); xs <- rnorm(200); ys <- 1.5*xs + rnorm(200) -# ksr11_kosorok_efficient_score(xs, ys) +# morie_ksr11_kosorok_efficient_score(xs, ys) -#' @rdname ksr11_kosorok_efficient_score +#' @rdname morie_ksr11_kosorok_efficient_score #' @keywords internal #' @export -kosorok_efficient_score <- ksr11_kosorok_efficient_score +morie_kosorok_efficient_score <- morie_ksr11_kosorok_efficient_score diff --git a/r-package/morie/R/ksr12.R b/r-package/morie/R/ksr12.R index a0d2388e4d..624dd1eda0 100644 --- a/r-package/morie/R/ksr12.R +++ b/r-package/morie/R/ksr12.R @@ -10,15 +10,14 @@ #' @return Named list with estimate, n, method. #' @references Kosorok (2008), Ch 6. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr12_kosorok_information_bound(x = rnorm(50), y = rnorm(50)) #' @export -ksr12_kosorok_information_bound <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) +morie_ksr12_kosorok_information_bound <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) n <- length(x) - xc <- x - mean(x); yc <- y - mean(y) + xc <- x - mean(x) + yc <- y - mean(y) beta <- sum(xc * yc) / sum(xc^2) resid <- yc - beta * xc sigma2 <- sum(resid^2) / (n - 2) @@ -32,9 +31,9 @@ ksr12_kosorok_information_bound <- function(x, y) { # CANONICAL TEST # set.seed(0); xs <- rnorm(200); ys <- 1.5*xs + rnorm(200) -# ksr12_kosorok_information_bound(xs, ys) +# morie_ksr12_kosorok_information_bound(xs, ys) -#' @rdname ksr12_kosorok_information_bound +#' @rdname morie_ksr12_kosorok_information_bound #' @keywords internal #' @export -kosorok_information_bound <- ksr12_kosorok_information_bound +morie_kosorok_information_bound <- morie_ksr12_kosorok_information_bound diff --git a/r-package/morie/R/ksr13.R b/r-package/morie/R/ksr13.R index d36fe6c63b..12ee61429a 100644 --- a/r-package/morie/R/ksr13.R +++ b/r-package/morie/R/ksr13.R @@ -9,12 +9,9 @@ #' @return Named list with estimate (rank), n, method. #' @references Kosorok (2008), Ch 6. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr13_kosorok_tangent_space(x = rnorm(50)) #' @export -ksr13_kosorok_tangent_space <- function(x) { +morie_ksr13_kosorok_tangent_space <- function(x) { x <- as.numeric(x) n <- length(x) s1 <- x - mean(x) @@ -30,9 +27,9 @@ ksr13_kosorok_tangent_space <- function(x) { } # CANONICAL TEST -# set.seed(0); ksr13_kosorok_tangent_space(rnorm(200)) +# set.seed(0); morie_ksr13_kosorok_tangent_space(rnorm(200)) -#' @rdname ksr13_kosorok_tangent_space +#' @rdname morie_ksr13_kosorok_tangent_space #' @keywords internal #' @export -kosorok_tangent_space <- ksr13_kosorok_tangent_space +morie_kosorok_tangent_space <- morie_ksr13_kosorok_tangent_space diff --git a/r-package/morie/R/ksr14.R b/r-package/morie/R/ksr14.R index 8727d3c222..b8454db45f 100644 --- a/r-package/morie/R/ksr14.R +++ b/r-package/morie/R/ksr14.R @@ -10,15 +10,14 @@ #' @return Named list with estimate, se, n, method. #' @references Kosorok (2008), Ch 7. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr14_kosorok_profile_likelihood(x = rnorm(50), y = rnorm(50)) #' @export -ksr14_kosorok_profile_likelihood <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) +morie_ksr14_kosorok_profile_likelihood <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) n <- length(x) - xc <- x - mean(x); yc <- y - mean(y) + xc <- x - mean(x) + yc <- y - mean(y) Sxx <- sum(xc^2) beta <- sum(xc * yc) / Sxx resid <- yc - beta * xc @@ -26,15 +25,15 @@ ksr14_kosorok_profile_likelihood <- function(x, y) { se <- sqrt(sigma2 / Sxx) list( estimate = beta, se = se, n = n, - method = "Profile likelihood for OLS slope (eta=sigma^2 profiled)" + method = "Profile likelihood for OLS slope (eta=sigma^2 profiled)" ) } # CANONICAL TEST # set.seed(0); xs <- rnorm(200); ys <- 1.5*xs + rnorm(200) -# ksr14_kosorok_profile_likelihood(xs, ys) +# morie_ksr14_kosorok_profile_likelihood(xs, ys) -#' @rdname ksr14_kosorok_profile_likelihood +#' @rdname morie_ksr14_kosorok_profile_likelihood #' @keywords internal #' @export -kosorok_profile_likelihood <- ksr14_kosorok_profile_likelihood +morie_kosorok_profile_likelihood <- morie_ksr14_kosorok_profile_likelihood diff --git a/r-package/morie/R/ksr15.R b/r-package/morie/R/ksr15.R index fdd1323f95..acee131ed5 100644 --- a/r-package/morie/R/ksr15.R +++ b/r-package/morie/R/ksr15.R @@ -9,12 +9,9 @@ #' @return Named list with estimate, se, n, method. #' @references Kosorok (2008), Ch 7. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr15_kosorok_one_step_estimator(x = rnorm(50)) #' @export -ksr15_kosorok_one_step_estimator <- function(x, y = NULL) { +morie_ksr15_kosorok_one_step_estimator <- function(x, y = NULL) { x <- as.numeric(x) n <- length(x) theta_init <- stats::median(x) @@ -23,14 +20,14 @@ ksr15_kosorok_one_step_estimator <- function(x, y = NULL) { se <- stats::sd(x) / sqrt(n) list( estimate = theta_tilde, se = se, n = n, - method = "One-step from median: theta + mean(x-theta)" + method = "One-step from median: theta + mean(x-theta)" ) } # CANONICAL TEST -# set.seed(0); ksr15_kosorok_one_step_estimator(rnorm(200)) +# set.seed(0); morie_ksr15_kosorok_one_step_estimator(rnorm(200)) -#' @rdname ksr15_kosorok_one_step_estimator +#' @rdname morie_ksr15_kosorok_one_step_estimator #' @keywords internal #' @export -kosorok_one_step_estimator <- ksr15_kosorok_one_step_estimator +morie_kosorok_one_step_estimator <- morie_ksr15_kosorok_one_step_estimator diff --git a/r-package/morie/R/ksr16.R b/r-package/morie/R/ksr16.R index 2de4ef3012..67849c240a 100644 --- a/r-package/morie/R/ksr16.R +++ b/r-package/morie/R/ksr16.R @@ -9,15 +9,14 @@ #' @return Named list with estimate, n, method. #' @references Kosorok (2008), Ch 7. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr16_kosorok_influence_function(x = rnorm(50), y = rnorm(50)) #' @export -ksr16_kosorok_influence_function <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) +morie_ksr16_kosorok_influence_function <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) n <- length(x) - xc <- x - mean(x); yc <- y - mean(y) + xc <- x - mean(x) + yc <- y - mean(y) var_x <- sum(xc^2) / n beta <- sum(xc * yc) / sum(xc^2) resid <- yc - beta * xc @@ -31,9 +30,9 @@ ksr16_kosorok_influence_function <- function(x, y) { # CANONICAL TEST # set.seed(0); xs <- rnorm(200); ys <- 1.5*xs + rnorm(200) -# ksr16_kosorok_influence_function(xs, ys) +# morie_ksr16_kosorok_influence_function(xs, ys) -#' @rdname ksr16_kosorok_influence_function +#' @rdname morie_ksr16_kosorok_influence_function #' @keywords internal #' @export -kosorok_influence_function <- ksr16_kosorok_influence_function +morie_kosorok_influence_function <- morie_ksr16_kosorok_influence_function diff --git a/r-package/morie/R/ksr17.R b/r-package/morie/R/ksr17.R index 7c3736f20d..1473e9ba15 100644 --- a/r-package/morie/R/ksr17.R +++ b/r-package/morie/R/ksr17.R @@ -9,13 +9,11 @@ #' @return Named list with estimate (total events), n, method. #' @references Kosorok (2008), Ch 8. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr17_kosorok_counting_process(t = seq(0, 1, length.out = 50), event = rbinom(50, 1, 0.8)) #' @export -ksr17_kosorok_counting_process <- function(t, event) { - t <- as.numeric(t); event <- as.integer(event) +morie_ksr17_kosorok_counting_process <- function(t, event) { + t <- as.numeric(t) + event <- as.integer(event) list( estimate = as.integer(sum(event)), n = length(t), @@ -24,9 +22,9 @@ ksr17_kosorok_counting_process <- function(t, event) { } # CANONICAL TEST -# ksr17_kosorok_counting_process(1:10, c(1,1,0,1,1,0,1,1,1,0)) +# morie_ksr17_kosorok_counting_process(1:10, c(1,1,0,1,1,0,1,1,1,0)) -#' @rdname ksr17_kosorok_counting_process +#' @rdname morie_ksr17_kosorok_counting_process #' @keywords internal #' @export -kosorok_counting_process <- ksr17_kosorok_counting_process +morie_kosorok_counting_process <- morie_ksr17_kosorok_counting_process diff --git a/r-package/morie/R/ksr18.R b/r-package/morie/R/ksr18.R index ad4eb93b77..91468328c9 100644 --- a/r-package/morie/R/ksr18.R +++ b/r-package/morie/R/ksr18.R @@ -9,17 +9,17 @@ #' @return Named list with estimate, se, n, method. #' @references Kosorok (2008), Ch 8. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr18_kosorok_nelson_aalen(t = seq(0, 1, length.out = 50), event = rbinom(50, 1, 0.8)) #' @export -ksr18_kosorok_nelson_aalen <- function(t, event) { - t <- as.numeric(t); event <- as.integer(event) +morie_ksr18_kosorok_nelson_aalen <- function(t, event) { + t <- as.numeric(t) + event <- as.integer(event) n <- length(t) ord <- order(t) - t <- t[ord]; event <- event[ord] - cum_h <- 0; cum_v <- 0 + t <- t[ord] + event <- event[ord] + cum_h <- 0 + cum_v <- 0 i <- 1L while (i <= n) { j <- i @@ -41,9 +41,9 @@ ksr18_kosorok_nelson_aalen <- function(t, event) { } # CANONICAL TEST -# ksr18_kosorok_nelson_aalen(1:10, c(1,1,0,1,1,0,1,1,1,0)) +# morie_ksr18_kosorok_nelson_aalen(1:10, c(1,1,0,1,1,0,1,1,1,0)) -#' @rdname ksr18_kosorok_nelson_aalen +#' @rdname morie_ksr18_kosorok_nelson_aalen #' @keywords internal #' @export -kosorok_nelson_aalen <- ksr18_kosorok_nelson_aalen +morie_kosorok_nelson_aalen <- morie_ksr18_kosorok_nelson_aalen diff --git a/r-package/morie/R/ksr19.R b/r-package/morie/R/ksr19.R index e1c31b8c57..ff7d60b83c 100644 --- a/r-package/morie/R/ksr19.R +++ b/r-package/morie/R/ksr19.R @@ -14,18 +14,22 @@ #' @return Named list with estimate, se, n, method. #' @references Kosorok (2008), Ch 8. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr19_kosorok_cox_partial_likelihood( +#' x = rnorm(50), +#' t = seq(0, 1, length.out = 50), event = rbinom(50, 1, 0.8) +#' ) #' @export -ksr19_kosorok_cox_partial_likelihood <- function(x, t, event, - tol = 1e-10, - max_iter = 100) { - x <- as.numeric(x); t <- as.numeric(t); event <- as.integer(event) +morie_ksr19_kosorok_cox_partial_likelihood <- function(x, t, event, + tol = 1e-10, + max_iter = 100) { + x <- as.numeric(x) + t <- as.numeric(t) + event <- as.integer(event) n <- length(x) ord <- order(t) - x <- x[ord]; t <- t[ord]; event <- event[ord] + x <- x[ord] + t <- t[ord] + event <- event[ord] beta <- 0 rev_cumsum <- function(v) rev(cumsum(rev(v))) for (it in seq_len(max_iter)) { @@ -41,7 +45,9 @@ ksr19_kosorok_cox_partial_likelihood <- function(x, t, event, if (abs(step) < tol) break } wt <- exp(beta * x) - S0 <- rev_cumsum(wt); S1 <- rev_cumsum(wt * x); S2 <- rev_cumsum(wt * x * x) + S0 <- rev_cumsum(wt) + S1 <- rev_cumsum(wt * x) + S2 <- rev_cumsum(wt * x * x) info <- sum(event * (S2 / S0 - (S1 / S0)^2)) se <- if (info > 0) sqrt(1 / info) else NA_real_ list( @@ -54,9 +60,9 @@ ksr19_kosorok_cox_partial_likelihood <- function(x, t, event, # CANONICAL TEST # set.seed(0); xs <- rnorm(100); ts <- rexp(100, rate=exp(0.5*xs)) -# ksr19_kosorok_cox_partial_likelihood(xs, ts, rep(1, 100)) +# morie_ksr19_kosorok_cox_partial_likelihood(xs, ts, rep(1, 100)) -#' @rdname ksr19_kosorok_cox_partial_likelihood +#' @rdname morie_ksr19_kosorok_cox_partial_likelihood #' @keywords internal #' @export -kosorok_cox_partial_likelihood <- ksr19_kosorok_cox_partial_likelihood +morie_kosorok_cox_partial_likelihood <- morie_ksr19_kosorok_cox_partial_likelihood diff --git a/r-package/morie/R/ksr20.R b/r-package/morie/R/ksr20.R index 41754d7479..1f27751599 100644 --- a/r-package/morie/R/ksr20.R +++ b/r-package/morie/R/ksr20.R @@ -9,18 +9,18 @@ #' @return Named list with estimate, se, n, method. #' @references Kosorok (2008), Ch 8. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ksr20_kosorok_censoring_survival(t = seq(0, 1, length.out = 50), event = rbinom(50, 1, 0.8)) #' @export -ksr20_kosorok_censoring_survival <- function(t, event) { - t <- as.numeric(t); event <- as.integer(event) +morie_ksr20_kosorok_censoring_survival <- function(t, event) { + t <- as.numeric(t) + event <- as.integer(event) n <- length(t) c_indic <- 1L - event ord <- order(t) - t <- t[ord]; c_indic <- c_indic[ord] - S <- 1; var_factor <- 0 + t <- t[ord] + c_indic <- c_indic[ord] + S <- 1 + var_factor <- 0 i <- 1L while (i <= n) { j <- i @@ -42,9 +42,9 @@ ksr20_kosorok_censoring_survival <- function(t, event) { } # CANONICAL TEST -# ksr20_kosorok_censoring_survival(1:10, c(1,1,0,1,1,0,1,1,1,0)) +# morie_ksr20_kosorok_censoring_survival(1:10, c(1,1,0,1,1,0,1,1,1,0)) -#' @rdname ksr20_kosorok_censoring_survival +#' @rdname morie_ksr20_kosorok_censoring_survival #' @keywords internal #' @export -kosorok_censoring_survival <- ksr20_kosorok_censoring_survival +morie_kosorok_censoring_survival <- morie_ksr20_kosorok_censoring_survival diff --git a/r-package/morie/R/ktaup.R b/r-package/morie/R/ktaup.R index 4f212bf5a9..1258bdb090 100644 --- a/r-package/morie/R/ktaup.R +++ b/r-package/morie/R/ktaup.R @@ -10,28 +10,33 @@ #' tau_xz, tau_yz, z, n. #' @importFrom stats cor pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_kendall_tau_partial(x = rnorm(50), y = rnorm(50), z = rnorm(50)) #' @export -kendall_tau_partial <- function(x, y, z) { - x <- as.numeric(x); y <- as.numeric(y); z <- as.numeric(z) +morie_kendall_tau_partial <- function(x, y, z) { + x <- as.numeric(x) + y <- as.numeric(y) + z <- as.numeric(z) n <- min(length(x), length(y), length(z)) if (n < 4) { - return(list(statistic = NA_real_, p_value = NA_real_, - tau_xy = NA_real_, tau_xz = NA_real_, tau_yz = NA_real_, - n = n, method = "Kendall partial tau")) + return(list( + statistic = NA_real_, p_value = NA_real_, + tau_xy = NA_real_, tau_xz = NA_real_, tau_yz = NA_real_, + n = n, method = "Kendall partial tau" + )) } - x <- x[1:n]; y <- y[1:n]; z <- z[1:n] + x <- x[1:n] + y <- y[1:n] + z <- z[1:n] tau_xy <- stats::cor(x, y, method = "kendall") tau_xz <- stats::cor(x, z, method = "kendall") tau_yz <- stats::cor(y, z, method = "kendall") denom <- sqrt((1 - tau_xz^2) * (1 - tau_yz^2)) if (denom == 0 || !is.finite(denom)) { - return(list(statistic = NA_real_, p_value = NA_real_, - tau_xy = tau_xy, tau_xz = tau_xz, tau_yz = tau_yz, - n = n, method = "Kendall partial tau")) + return(list( + statistic = NA_real_, p_value = NA_real_, + tau_xy = tau_xy, tau_xz = tau_xz, tau_yz = tau_yz, + n = n, method = "Kendall partial tau" + )) } tau_p <- (tau_xy - tau_xz * tau_yz) / denom z_stat <- tau_p * sqrt(9 * n * (n - 1) / (2 * (2 * n + 5))) diff --git a/r-package/morie/R/kvcmp.R b/r-package/morie/R/kvcmp.R index 14a6f365c8..e59edd711b 100644 --- a/r-package/morie/R/kvcmp.R +++ b/r-package/morie/R/kvcmp.R @@ -12,7 +12,8 @@ kv_cache_management <- function(K_cache, V_cache, k_new, v_new, max_len = NULL) { if (is.null(K_cache)) { - K_new <- as.matrix(k_new); V_new <- as.matrix(v_new) + K_new <- as.matrix(k_new) + V_new <- as.matrix(v_new) } else { K_new <- rbind(as.matrix(K_cache), as.matrix(k_new)) V_new <- rbind(as.matrix(V_cache), as.matrix(v_new)) @@ -21,6 +22,8 @@ kv_cache_management <- function(K_cache, V_cache, k_new, v_new, K_new <- K_new[(nrow(K_new) - max_len + 1L):nrow(K_new), , drop = FALSE] V_new <- V_new[(nrow(V_new) - max_len + 1L):nrow(V_new), , drop = FALSE] } - list(K = K_new, V = V_new, T = nrow(K_new), max_len = max_len, - method = "kv-cache-append") + list( + K = K_new, V = V_new, T = nrow(K_new), max_len = max_len, + method = "kv-cache-append" + ) } diff --git a/r-package/morie/R/latnh.R b/r-package/morie/R/latnh.R index 2442f486b1..2bfa47cc3b 100644 --- a/r-package/morie/R/latnh.R +++ b/r-package/morie/R/latnh.R @@ -19,8 +19,10 @@ latnh <- function(N = 100L, d = 1L, f = NULL, seed = 42L) { u_j <- cut + stats::runif(N, 0, 1 / N) sample[, j] <- sample(u_j) } - out <- list(sample = sample, N = as.integer(N), d = as.integer(d), - method = "Latin hypercube (McKay et al. 1979)") + out <- list( + sample = sample, N = as.integer(N), d = as.integer(d), + method = "Latin hypercube (McKay et al. 1979)" + ) if (!is.null(f)) { fv <- apply(sample, 1, f) out$estimate <- mean(fv) @@ -36,4 +38,4 @@ latnh <- function(N = 100L, d = 1L, f = NULL, seed = 42L) { #' @rdname latnh #' @keywords internal #' @export -latin_hypercube <- latnh +morie_latin_hypercube <- latnh diff --git a/r-package/morie/R/license_check.R b/r-package/morie/R/license_check.R index 7f33aa66ee..067d3faab0 100644 --- a/r-package/morie/R/license_check.R +++ b/r-package/morie/R/license_check.R @@ -12,6 +12,12 @@ #' (\code{daemon/morie_lsm.py}) and the kernel companion module #' (\code{kernel-module/morie.c}). #' +#' @return \code{morie_gpl_compatible_licenses()} returns a character vector +#' of GPL-compatible SPDX identifiers; \code{check_plugin_license()} returns +#' a logical (invisibly), signalling a warning or error when the supplied +#' licence is not GPL-compatible. +#' @examples +#' morie_gpl_compatible_licenses() #' @name license_check NULL @@ -25,13 +31,11 @@ NULL #' #' @return Character vector of SPDX identifiers. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_gpl_compatible_licenses() #' @export morie_gpl_compatible_licenses <- function() { - c("GPL-2.0-only", "GPL-2.0-or-later", + c( + "GPL-2.0-only", "GPL-2.0-or-later", "GPL-3.0-only", "GPL-3.0-or-later", "LGPL-2.1-only", "LGPL-2.1-or-later", "LGPL-3.0-only", "LGPL-3.0-or-later", @@ -40,7 +44,8 @@ morie_gpl_compatible_licenses <- function() { "MPL-2.0", "CC0-1.0", "Unlicense", - "Zlib") + "Zlib" + ) } @@ -50,10 +55,7 @@ morie_gpl_compatible_licenses <- function() { #' pipeline build manifests, auditd logs, and downstream #' compliance pipelines. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_license_metadata() #' @export morie_license_metadata <- function() { list( @@ -78,11 +80,14 @@ morie_license_metadata <- function() { #' @examples #' morie_check_plugin_license("MIT") #' \dontrun{ -#' morie_check_plugin_license("LicenseRef-Proprietary", -#' raise_on_incompatible = TRUE) +#' # The next call demonstrates the error path; runs only on +#' # explicit example() with run.dontrun = TRUE. +#' morie_check_plugin_license("LicenseRef-Proprietary", +#' raise_on_incompatible = TRUE +#' ) #' } morie_check_plugin_license <- function(plugin_spdx, - raise_on_incompatible = FALSE) { + raise_on_incompatible = FALSE) { if (is.null(plugin_spdx) || !nzchar(plugin_spdx)) { msg <- "Plugin reports empty SPDX identifier." if (raise_on_incompatible) stop(msg) else warning(msg, call. = FALSE) diff --git a/r-package/morie/R/linrg.R b/r-package/morie/R/linrg.R index e9f5261bc5..09667172cf 100644 --- a/r-package/morie/R/linrg.R +++ b/r-package/morie/R/linrg.R @@ -12,19 +12,18 @@ #' @references #' Hastie, Tibshirani & Friedman, Elements of Statistical Learning (2009). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_linear_regression_ols(x = rnorm(50), y = rnorm(50)) #' @export -linear_regression_ols <- function(x, y) { +morie_linear_regression_ols <- function(x, y) { if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); y <- as.numeric(y) - df <- as.data.frame(x); df$.y <- y + x <- as.matrix(x) + y <- as.numeric(y) + df <- as.data.frame(x) + df$.y <- y fit <- stats::lm(.y ~ ., data = df) s <- summary(fit) est <- unname(stats::coef(fit)) - se <- unname(s$coefficients[, "Std. Error"]) + se <- unname(s$coefficients[, "Std. Error"]) list( estimate = est, se = se, diff --git a/r-package/morie/R/longitudinal_sim.R b/r-package/morie/R/longitudinal_sim.R index af72ad1a26..16e9628d1c 100644 --- a/r-package/morie/R/longitudinal_sim.R +++ b/r-package/morie/R/longitudinal_sim.R @@ -17,6 +17,12 @@ #' from Hamilton (1994) and Diggle, Liang, Zeger (1994), implemented #' here independently. #' +#' @return The simulation callables return tidy longitudinal-panel +#' \code{data.frame}s; \code{morie_sync_rng()} returns an environment +#' exposing synchronised \code{rnorm}, \code{runif}, and \code{sample} +#' methods. +#' @examples +#' rng <- morie_sync_rng(42) #' @name longitudinal_sim NULL @@ -32,19 +38,62 @@ NULL #' @return An environment with \code{rnorm}, \code{runif}, \code{sample} #' methods that share the same underlying RNG state. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_sync_rng(seed = 1L) #' @export morie_sync_rng <- function(seed) { - stopifnot(is.numeric(seed), length(seed) == 1L, seed >= 0, - seed == as.integer(seed)) - set.seed(as.integer(seed), kind = "L'Ecuyer-CMRG") + stopifnot( + is.numeric(seed), length(seed) == 1L, seed >= 0, + seed == as.integer(seed) + ) env <- new.env(parent = emptyenv()) - env$rnorm <- function(n, mean = 0, sd = 1) stats::rnorm(n, mean, sd) - env$runif <- function(n, min = 0, max = 1) stats::runif(n, min, max) - env$sample <- function(x, size, replace = FALSE) base::sample(x, size, replace) + + # The synchronised L'Ecuyer-CMRG stream is kept privately in + # `env$.state` and swapped into the global RNG only for the duration + # of a single draw, so morie_sync_rng() never leaves the caller's + # global RNG kind or seed mutated. + .restore <- function(kind, seed_val) { + RNGkind(kind[1L], kind[2L], kind[3L]) + if (is.null(seed_val)) { + if (exists(".Random.seed", globalenv(), inherits = FALSE)) { + rm(".Random.seed", envir = globalenv()) + } + } else { + assign(".Random.seed", seed_val, envir = globalenv()) + } + } + old_kind <- RNGkind() + old_seed <- if (exists(".Random.seed", globalenv(), inherits = FALSE)) { + get(".Random.seed", globalenv()) + } else { + NULL + } + set.seed(as.integer(seed), kind = "L'Ecuyer-CMRG") + env$.state <- get(".Random.seed", globalenv()) + .restore(old_kind, old_seed) + + .draw <- function(fun) { + cur_kind <- RNGkind() + cur_seed <- if (exists(".Random.seed", globalenv(), inherits = FALSE)) { + get(".Random.seed", globalenv()) + } else { + NULL + } + on.exit(.restore(cur_kind, cur_seed), add = TRUE) + RNGkind("L'Ecuyer-CMRG") + assign(".Random.seed", env$.state, envir = globalenv()) + out <- fun() + env$.state <- get(".Random.seed", globalenv()) + out + } + env$rnorm <- function(n, mean = 0, sd = 1) { + .draw(function() stats::rnorm(n, mean, sd)) + } + env$runif <- function(n, min = 0, max = 1) { + .draw(function() stats::runif(n, min, max)) + } + env$sample <- function(x, size, replace = FALSE) { + .draw(function() base::sample(x, size, replace)) + } env } @@ -58,17 +107,15 @@ morie_sync_rng <- function(seed) { #' (1) and full off-diagonal coupling (0). #' @return A p x p numeric matrix A. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export morie_generate_ar_coefficients <- function(p, rng, - spectral_radius = 0.8, - diagonal_bias = 0.4) { + spectral_radius = 0.8, + diagonal_bias = 0.4) { stopifnot(p >= 1, spectral_radius > 0, spectral_radius < 1) A_diag <- diagonal_bias * diag(p) - A_off <- (1 - diagonal_bias) * matrix(rng$rnorm(p * p) * 0.3, p, p) + A_off <- (1 - diagonal_bias) * matrix(rng$rnorm(p * p) * 0.3, p, p) diag(A_off) <- 0 A <- A_diag + A_off rho <- max(Mod(eigen(A, only.values = TRUE)$values)) @@ -86,18 +133,17 @@ morie_generate_ar_coefficients <- function(p, rng, #' @param decay Geometric decay rate of spectral radius across lags. #' @return A list of length \code{lags}, each a p x p matrix. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export morie_generate_var_coefficients <- function(p, lags, rng, - spectral_radius = 0.8, - decay = 0.6) { + spectral_radius = 0.8, + decay = 0.6) { stopifnot(lags >= 1) lapply(seq_len(lags) - 1L, function(l) { morie_generate_ar_coefficients(p, rng, - spectral_radius = spectral_radius * decay^l) + spectral_radius = spectral_radius * decay^l + ) }) } @@ -113,22 +159,22 @@ morie_generate_var_coefficients <- function(p, lags, rng, #' @param mean Optional length-p mean vector. #' @return An n x p matrix of samples. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export morie_mvn_with_covariance <- function(n, p, rng, - kernel = c("ar1", "independent", "compound", "toeplitz"), - rho = 0.5, mean = NULL) { + kernel = c("ar1", "independent", "compound", "toeplitz"), + rho = 0.5, mean = NULL) { kernel <- match.arg(kernel) if (is.null(mean)) mean <- rep(0, p) sigma <- switch(kernel, independent = diag(p), - ar1 = , - toeplitz = rho ^ abs(outer(seq_len(p), seq_len(p), "-")), - compound = (function() { - m <- matrix(rho, p, p); diag(m) <- 1; m + ar1 = , + toeplitz = rho^abs(outer(seq_len(p), seq_len(p), "-")), + compound = (function() { + m <- matrix(rho, p, p) + diag(m) <- 1 + m })() ) L <- chol(sigma) @@ -169,13 +215,15 @@ morie_simulate_longitudinal_panel <- function( seed = 42L ) { rng <- morie_sync_rng(seed) - A <- morie_generate_var_coefficients(p_variables, ar_lags, rng, - spectral_radius = ar_spectral_radius, - decay = ar_decay) + A <- morie_generate_var_coefficients(p_variables, ar_lags, rng, + spectral_radius = ar_spectral_radius, + decay = ar_decay + ) panel <- array(0, dim = c(n_individuals, n_timepoints, p_variables)) for (i in seq_len(n_individuals)) { eps <- morie_mvn_with_covariance(n_timepoints, p_variables, rng, - kernel = cov_kernel, rho = cov_rho) + kernel = cov_kernel, rho = cov_rho + ) history <- vector("list", n_timepoints) for (t in seq_len(n_timepoints)) { x_new <- eps[t, ] @@ -195,10 +243,12 @@ morie_simulate_longitudinal_panel <- function( omask <- array(rng$runif(prod(dim(panel))) < outlier_fraction, dim(panel)) panel[omask] <- panel[omask] * outlier_scale } - out <- expand.grid(subject_id = seq_len(n_individuals) - 1L, - t = seq_len(n_timepoints) - 1L, - variable = seq_len(p_variables) - 1L, - KEEP.OUT.ATTRS = FALSE) + out <- expand.grid( + subject_id = seq_len(n_individuals) - 1L, + t = seq_len(n_timepoints) - 1L, + variable = seq_len(p_variables) - 1L, + KEEP.OUT.ATTRS = FALSE + ) out$value <- as.numeric(panel) out[order(out$subject_id, out$t, out$variable), ] } diff --git a/r-package/morie/R/lradw.R b/r-package/morie/R/lradw.R index 39b3fda272..aaa720e671 100644 --- a/r-package/morie/R/lradw.R +++ b/r-package/morie/R/lradw.R @@ -11,7 +11,9 @@ lr_warmup <- function(x, lr_target = 1e-3, warmup_steps = 1000L) { if (warmup_steps <= 0) stop("warmup_steps must be > 0") t <- as.numeric(x) lr <- lr_target * pmin(1, t / warmup_steps) - list(tensor = lr, value = lr[1L], - lr_target = lr_target, warmup_steps = warmup_steps, - step = t, method = "linear-warmup") + list( + tensor = lr, value = lr[1L], + lr_target = lr_target, warmup_steps = warmup_steps, + step = t, method = "linear-warmup" + ) } diff --git a/r-package/morie/R/lrcvg.R b/r-package/morie/R/lrcvg.R index 89e04982e4..d1e0c6faf4 100644 --- a/r-package/morie/R/lrcvg.R +++ b/r-package/morie/R/lrcvg.R @@ -2,7 +2,7 @@ #' Learning curve -- train / val MSE vs training-set size (R parity) #' -#' Manual implementation of the sklearn learning_curve flow: shuffle, +#' Manual implementation of the sklearn morie_learning_curve flow: shuffle, #' split into k folds, for each train-fraction fit on a prefix of the #' training fold and score on the held-out fold. #' @@ -14,14 +14,12 @@ #' @return Named list: estimate (final val MSE), train_sizes, train_scores, #' val_scores, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_learning_curve(x = rnorm(50), y = rnorm(50)) #' @export -learning_curve <- function(x, y, sizes = NULL, cv = 5L, seed = 0L) { +morie_learning_curve <- function(x, y, sizes = NULL, cv = 5L, seed = 0L) { if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); y <- as.numeric(y) + x <- as.matrix(x) + y <- as.numeric(y) n <- nrow(x) if (is.null(sizes)) sizes <- seq(0.1, 1.0, length.out = 5) set.seed(seed) @@ -29,24 +27,27 @@ learning_curve <- function(x, y, sizes = NULL, cv = 5L, seed = 0L) { folds <- cut(seq_len(n), breaks = cv, labels = FALSE) fold_idx <- split(idx, folds) train_scores <- numeric(length(sizes)) - val_scores <- numeric(length(sizes)) - train_sizes <- integer(length(sizes)) + val_scores <- numeric(length(sizes)) + train_sizes <- integer(length(sizes)) for (k in seq_along(sizes)) { - tr_mse <- numeric(cv); va_mse <- numeric(cv) + tr_mse <- numeric(cv) + va_mse <- numeric(cv) for (f in seq_len(cv)) { val_id <- fold_idx[[f]] tr_pool <- setdiff(idx, val_id) m <- max(1L, floor(sizes[k] * length(tr_pool))) tr_id <- tr_pool[seq_len(m)] train_sizes[k] <- m - df_tr <- as.data.frame(x[tr_id, , drop = FALSE]); df_tr$.y <- y[tr_id] - df_va <- as.data.frame(x[val_id, , drop = FALSE]); df_va$.y <- y[val_id] + df_tr <- as.data.frame(x[tr_id, , drop = FALSE]) + df_tr$.y <- y[tr_id] + df_va <- as.data.frame(x[val_id, , drop = FALSE]) + df_va$.y <- y[val_id] fit <- stats::lm(.y ~ ., data = df_tr) tr_mse[f] <- mean((stats::predict(fit, newdata = df_tr) - y[tr_id])^2) va_mse[f] <- mean((stats::predict(fit, newdata = df_va) - y[val_id])^2) } train_scores[k] <- mean(tr_mse) - val_scores[k] <- mean(va_mse) + val_scores[k] <- mean(va_mse) } list( estimate = as.numeric(val_scores[length(val_scores)]), diff --git a/r-package/morie/R/lstmc.R b/r-package/morie/R/lstmc.R index 1ca648a413..b817d5f5ae 100644 --- a/r-package/morie/R/lstmc.R +++ b/r-package/morie/R/lstmc.R @@ -21,20 +21,22 @@ #' @return Named list \code{(h, c, estimate, i, f, g, o, method)}. #' @references Hochreiter & Schmidhuber (1997), Neural Computation 9(8). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_lstmc_lstm_cell(x = rnorm(50)) #' @export -lstmc_lstm_cell <- function(x, h_prev = NULL, c_prev = NULL, +morie_lstmc_lstm_cell <- function(x, h_prev = NULL, c_prev = NULL, W = NULL, U = NULL, b = NULL, hidden_size = NULL, seed = 0L, deterministic_seed = NULL) { - x <- as.numeric(x); n_in <- length(x) + x <- as.numeric(x) + n_in <- length(x) if (is.null(hidden_size)) { - hidden_size <- if (!is.null(h_prev)) length(h_prev) - else if (!is.null(W)) nrow(as.matrix(W)) %/% 4L - else n_in + hidden_size <- if (!is.null(h_prev)) { + length(h_prev) + } else if (!is.null(W)) { + nrow(as.matrix(W)) %/% 4L + } else { + n_in + } } H <- as.integer(hidden_size) if (is.null(h_prev)) h_prev <- rep(0, H) @@ -55,12 +57,14 @@ lstmc_lstm_cell <- function(x, h_prev = NULL, c_prev = NULL, o <- sig(gates[(3 * H + 1L):(4 * H)]) c_new <- f * c_prev + i * g h_new <- o * tanh(c_new) - list(h = h_new, c = c_new, estimate = h_new, - i = i, f = f, g = g, o = o, - method = "LSTM cell forward") + list( + h = h_new, c = c_new, estimate = h_new, + i = i, f = f, g = g, o = o, + method = "LSTM cell forward" + ) } -#' @rdname lstmc_lstm_cell +#' @rdname morie_lstmc_lstm_cell #' @keywords internal #' @export -lstm_cell <- lstmc_lstm_cell +morie_lstm_cell <- morie_lstmc_lstm_cell diff --git a/r-package/morie/R/mandela.R b/r-package/morie/R/mandela.R index dca8432849..fd3acdf8ea 100644 --- a/r-package/morie/R/mandela.R +++ b/r-package/morie/R/mandela.R @@ -62,7 +62,8 @@ #' Sprott, J. B., & Doob, A. N. (2021). Solitary Confinement, Torture, #' and Canada's Structured Intervention Units. Centre for Criminology #' and Sociolegal Studies, University of Toronto. -#' \url{https://www.crimsl.utoronto.ca/sites/www.crimsl.utoronto.ca/files/TortureSolitarySIUsSprottDoob23Feb2021_0.pdf} +#' Available at the Centre for Criminology and Sociolegal Studies +#' web site: crimsl.utoronto.ca (file TortureSolitarySIUsSprottDoob23Feb2021_0.pdf). #' #' Iftene, A., & Doob, A. N. (2024). Do Independent External Decision #' Makers Ensure that "An Inmate's Confinement in a Structured @@ -105,10 +106,10 @@ mrm_classify_mandela <- function( if (broader_rc) { stopifnot(all(alert_cols %in% names(data))) alerts_count <- rowSums( - sapply(alert_cols, function(c) as.integer(data[[c]] > 0)) + vapply(alert_cols, function(c) as.integer(data[[c]] > 0), integer(nrow(data))) ) broader_row <- strict_row | (alerts_count >= 2L & !is.na(dur) & - dur > threshold_days) + dur > threshold_days) } else { broader_row <- strict_row } @@ -145,7 +146,7 @@ mrm_classify_mandela <- function( denom <- length(ids) n_m <- length(ids_m) n_b <- length(ids_b) - } else { # individual_cumulative + } else { # individual_cumulative ids <- unique(sub[[id_col]]) cum_dur <- tapply(dur[mask], sub[[id_col]], sum, na.rm = TRUE) denom <- length(ids) diff --git a/r-package/morie/R/manifest.R b/r-package/morie/R/manifest.R index 0925238924..11f3cf468b 100644 --- a/r-package/morie/R/manifest.R +++ b/r-package/morie/R/manifest.R @@ -8,12 +8,10 @@ escape_regex <- function(x) { #' @param strict If `TRUE`, stop on validation failures. #' @return `TRUE` when validation passes. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -validate_outputs_manifest <- function(manifest, strict = TRUE) { +morie_validate_outputs_manifest <- function(manifest, strict = TRUE) { required <- c("output", "public_path", "size_kb", "modified") fail <- function(msg) { @@ -44,10 +42,29 @@ validate_outputs_manifest <- function(manifest, strict = TRUE) { #' @param manifest_path Optional explicit manifest path. #' @param validate If `TRUE`, validate schema. #' @return Manifest data frame. +#' @examples +#' # Craft a minimal manifest in tempdir and read it back: +#' tdir <- tempfile("morie-doc-") +#' dir.create(tdir) +#' man <- file.path(tdir, "outputs_manifest.csv") +#' write.csv( +#' data.frame( +#' output = "results.csv", +#' public_path = file.path(tdir, "results.csv"), +#' size_kb = 0.01, modified = format(Sys.Date()) +#' ), +#' man, +#' row.names = FALSE +#' ) +#' writeLines("x,y\n1,2", file.path(tdir, "results.csv")) +#' morie_read_outputs_manifest(manifest_path = man) #' @export -read_outputs_manifest <- function(project_root = NULL, manifest_path = NULL, validate = TRUE) { - paths <- morie_paths(project_root) - path <- manifest_path %||% paths$outputs_manifest +morie_read_outputs_manifest <- function(project_root = NULL, manifest_path = NULL, validate = TRUE) { + # When an explicit manifest_path is supplied, do not require a project + # root (morie_find_project_root() fails under R CMD check / covr where the + # working directory is a temporary install tree). `%||%` is lazy, so + # morie_paths() is only consulted when manifest_path is NULL. + path <- manifest_path %||% morie_paths(project_root)$outputs_manifest if (!file.exists(path)) { stop("Outputs manifest not found: ", path, call. = FALSE) @@ -55,7 +72,7 @@ read_outputs_manifest <- function(project_root = NULL, manifest_path = NULL, val manifest <- utils::read.csv(path, stringsAsFactors = FALSE, check.names = FALSE) if (isTRUE(validate)) { - validate_outputs_manifest(manifest, strict = TRUE) + morie_validate_outputs_manifest(manifest, strict = TRUE) } manifest } @@ -68,12 +85,14 @@ read_outputs_manifest <- function(project_root = NULL, manifest_path = NULL, val #' @param extensions File extensions to include (without dots). #' @return Manifest data frame. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # Scan a tempdir of output files and build a manifest CSV: +#' tdir <- tempfile("morie-doc-") +#' dir.create(tdir) +#' writeLines("x,y\n1,2", file.path(tdir, "results.csv")) +#' writeLines("# report", file.path(tdir, "report.md")) +#' morie_build_outputs_manifest(tdir, file.path(tdir, "outputs_manifest.csv")) #' @export -build_outputs_manifest <- function( +morie_build_outputs_manifest <- function( output_dir, manifest_path, public_prefix = "data/manifest/outputs", @@ -123,11 +142,22 @@ build_outputs_manifest <- function( #' @param project_root Project root directory. #' @param manifest Manifest data frame. If `NULL`, loaded from disk. #' @return Data frame containing declared and observed output status. +#' @examples +#' # Craft a tempdir manifest + output file, then audit: +#' tdir <- tempfile("morie-doc-") +#' dir.create(tdir) +#' writeLines("x,y\n1,2", file.path(tdir, "results.csv")) +#' man <- data.frame( +#' output = "results.csv", +#' public_path = file.path(tdir, "results.csv"), +#' size_kb = 0.01, modified = format(Sys.Date()) +#' ) +#' morie_audit_public_outputs(project_root = tdir, manifest = man) #' @export -audit_public_outputs <- function(project_root = NULL, manifest = NULL) { +morie_audit_public_outputs <- function(project_root = NULL, manifest = NULL) { paths <- morie_paths(project_root) - manifest <- manifest %||% read_outputs_manifest(paths$project_root) - validate_outputs_manifest(manifest, strict = TRUE) + manifest <- manifest %||% morie_read_outputs_manifest(paths$project_root) + morie_validate_outputs_manifest(manifest, strict = TRUE) to_abs <- function(p) { ifelse(is_absolute_path(p), p, file.path(paths$project_root, p)) @@ -199,15 +229,13 @@ audit_public_outputs <- function(project_root = NULL, manifest = NULL) { #' Summarize an output audit #' -#' @param audit_tbl Result from [audit_public_outputs()]. +#' @param audit_tbl Result from [morie_audit_public_outputs()]. #' @return Named list with high-level diagnostics. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -summarize_output_audit <- function(audit_tbl) { +morie_summarize_output_audit <- function(audit_tbl) { if (!is.data.frame(audit_tbl)) { stop("`audit_tbl` must be a data.frame.", call. = FALSE) } diff --git a/r-package/morie/R/mbgrd.R b/r-package/morie/R/mbgrd.R index 70119c2364..e3f4edbffa 100644 --- a/r-package/morie/R/mbgrd.R +++ b/r-package/morie/R/mbgrd.R @@ -11,16 +11,15 @@ #' @return Named list: estimate, reference_ols, n_epochs, batch_size, #' loss, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_mini_batch_gradient(x = rnorm(50), y = rnorm(50)) #' @export -mini_batch_gradient <- function(x, y, lr = 0.01, n_epochs = 200, +morie_mini_batch_gradient <- function(x, y, lr = 0.01, n_epochs = 200, batch_size = 32L, seed = 0L) { if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); y <- as.numeric(y) - n <- nrow(x); p <- ncol(x) + x <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(x) + p <- ncol(x) X1 <- cbind(1, x) theta <- rep(0.0, p + 1) set.seed(seed) @@ -29,7 +28,8 @@ mini_batch_gradient <- function(x, y, lr = 0.01, n_epochs = 200, starts <- seq.int(1L, n, by = batch_size) for (s in starts) { j <- idx[s:min(s + batch_size - 1L, n)] - xb <- X1[j, , drop = FALSE]; yb <- y[j] + xb <- X1[j, , drop = FALSE] + yb <- y[j] grad <- (2 / length(j)) * crossprod(xb, xb %*% theta - yb) theta <- theta - lr * as.numeric(grad) } diff --git a/r-package/morie/R/mcint.R b/r-package/morie/R/mcint.R index a263f17a37..c394f65ec8 100644 --- a/r-package/morie/R/mcint.R +++ b/r-package/morie/R/mcint.R @@ -17,9 +17,11 @@ mcint_crude <- function(f, a = 0, b = 1, N = 1000L, seed = 42L) { u <- stats::runif(N, a, b) fu <- vapply(u, f, numeric(1)) est <- (b - a) * mean(fu) - se <- (b - a) * stats::sd(fu) / sqrt(N) - list(estimate = as.numeric(est), se = as.numeric(se), - N = as.integer(N), method = "Monte Carlo integration (Rubinstein 1981)") + se <- (b - a) * stats::sd(fu) / sqrt(N) + list( + estimate = as.numeric(est), se = as.numeric(se), + N = as.integer(N), method = "Monte Carlo integration (Rubinstein 1981)" + ) } # CANONICAL TEST @@ -30,4 +32,4 @@ mcint_crude <- function(f, a = 0, b = 1, N = 1000L, seed = 42L) { #' @rdname mcint_crude #' @keywords internal #' @export -monte_carlo_integration <- mcint_crude +morie_monte_carlo_integration <- mcint_crude diff --git a/r-package/morie/R/mdrnk.R b/r-package/morie/R/mdrnk.R index 14b217230c..4358f9013b 100644 --- a/r-package/morie/R/mdrnk.R +++ b/r-package/morie/R/mdrnk.R @@ -8,25 +8,26 @@ #' @param x Numeric vector. #' @return Named list: midranks, n, ties, tie_correction. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' midranks(x = rnorm(50)) #' @export midranks <- function(x) { x <- as.numeric(x) n <- length(x) if (n < 1) { - return(list(midranks = numeric(0), n = 0L, ties = list(), - tie_correction = 0, - method = "Midranks")) + return(list( + midranks = numeric(0), n = 0L, ties = list(), + tie_correction = 0, + method = "Midranks" + )) } mr <- rank(x, ties.method = "average") tab <- table(x) - ties <- Filter(function(z) z[[2]] > 1, Map(function(v, c) list(as.numeric(v), as.integer(c)), - names(tab), tab)) - tie_correction <- sum(sapply(tab[tab > 1], function(c) c^3 - c)) - if (length(tab[tab > 1]) == 0) tie_correction <- 0 + ties <- Filter(function(z) z[[2]] > 1, Map( + function(v, c) list(as.numeric(v), as.integer(c)), + names(tab), tab + )) + tied <- as.numeric(tab[tab > 1]) + tie_correction <- if (length(tied) == 0L) 0 else sum(tied^3 - tied) list( midranks = mr, n = n, diff --git a/r-package/morie/R/mdspl.R b/r-package/morie/R/mdspl.R index c8fccf29a0..a785ec181c 100644 --- a/r-package/morie/R/mdspl.R +++ b/r-package/morie/R/mdspl.R @@ -11,39 +11,42 @@ #' @return Named list with `coords`, `eigenvalues`, `stress`, `k`, #' `n`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' mdspl(x = rnorm(50)) #' @export mdspl <- function(x, k = 2L) { X <- if (is.matrix(x)) x else matrix(as.numeric(x), ncol = 1L) n <- nrow(X) - if (n < 2L) - return(list(coords = matrix(0, n, k), eigenvalues = rep(0, k), - stress = NA_real_, k = k, n = n, - method = "mds_classical")) + if (n < 2L) { + return(list( + coords = matrix(0, n, k), eigenvalues = rep(0, k), + stress = NA_real_, k = k, n = n, + method = "mds_classical" + )) + } is_dist <- (nrow(X) == ncol(X)) && - isTRUE(all.equal(X, t(X))) && - all(abs(diag(X)) < 1e-9) + isTRUE(all.equal(X, t(X))) && + all(abs(diag(X)) < 1e-9) D <- if (is_dist) X else as.matrix(stats::dist(X)) D2 <- D^2 J <- diag(n) - matrix(1 / n, n, n) B <- -0.5 * J %*% D2 %*% J e <- eigen((B + t(B)) / 2, symmetric = TRUE) - ev <- e$values; vec <- e$vectors + ev <- e$values + vec <- e$vectors k_eff <- min(k, n - 1L) pos <- pmax(ev[seq_len(k_eff)], 0) coords <- vec[, seq_len(k_eff), drop = FALSE] * - matrix(sqrt(pos), n, k_eff, byrow = TRUE) + matrix(sqrt(pos), n, k_eff, byrow = TRUE) Dh <- as.matrix(stats::dist(coords)) denom <- sum(D^2) stress <- if (denom > 0) sqrt(sum((D - Dh)^2) / denom) else NA_real_ - list(coords = coords, eigenvalues = ev, stress = stress, - k = k_eff, n = n, method = "mds_classical") + list( + coords = coords, eigenvalues = ev, stress = stress, + k = k_eff, n = n, method = "mds_classical" + ) } #' @keywords internal #' @rdname mdspl #' @export -mds_spatial_map <- mdspl +morie_mds_spatial_map <- mdspl diff --git a/r-package/morie/R/mdvtr.R b/r-package/morie/R/mdvtr.R index c6a567ed89..3fd788323a 100644 --- a/r-package/morie/R/mdvtr.R +++ b/r-package/morie/R/mdvtr.R @@ -11,28 +11,29 @@ #' `n`, `method`. #' @references Armstrong et al. (2014), Ch 2. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' mdvtr(x = rnorm(50)) #' @export mdvtr <- function(x) { x <- as.numeric(x) x <- x[is.finite(x)] n <- length(x) if (n == 0L) { - return(list(estimate = NA_real_, se = NA_real_, ci_lower = NA_real_, - ci_upper = NA_real_, n = 0L, method = "median_voter")) + return(list( + estimate = NA_real_, se = NA_real_, ci_lower = NA_real_, + ci_upper = NA_real_, n = 0L, method = "morie_median_voter" + )) } est <- stats::median(x) - se <- if (n > 1L) 1.2533141373 * stats::sd(x) / sqrt(n) else NA_real_ + se <- if (n > 1L) 1.2533141373 * stats::sd(x) / sqrt(n) else NA_real_ ci_lo <- if (is.finite(se)) est - 1.96 * se else NA_real_ ci_hi <- if (is.finite(se)) est + 1.96 * se else NA_real_ - list(estimate = est, se = se, ci_lower = ci_lo, ci_upper = ci_hi, - n = n, method = "Median voter theorem") + list( + estimate = est, se = se, ci_lower = ci_lo, ci_upper = ci_hi, + n = n, method = "Median voter theorem" + ) } #' @keywords internal #' @rdname mdvtr #' @export -median_voter <- mdvtr +morie_median_voter <- mdvtr diff --git a/r-package/morie/R/mhatf.R b/r-package/morie/R/mhatf.R index ec1707f175..aea78f5c26 100644 --- a/r-package/morie/R/mhatf.R +++ b/r-package/morie/R/mhatf.R @@ -20,54 +20,66 @@ #' d_model, method)}. #' @references Vaswani et al. (2017), NeurIPS. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -mhatf_multi_head_attention_full <- function(x, num_heads = 2L, +morie_mhatf_multi_head_attention_full <- function(x, num_heads = 2L, W_q = NULL, W_k = NULL, W_v = NULL, W_o = NULL, seed = 0L, deterministic_seed = NULL) { x <- as.matrix(x) - seq_len <- nrow(x); d_model <- ncol(x) - if (d_model %% num_heads != 0L) - stop(sprintf("d_model=%d must be divisible by num_heads=%d", - d_model, num_heads)) + seq_len <- nrow(x) + d_model <- ncol(x) + if (d_model %% num_heads != 0L) { + stop(sprintf( + "d_model=%d must be divisible by num_heads=%d", + d_model, num_heads + )) + } d_k <- d_model %/% num_heads if (!is.null(deterministic_seed)) { morie_det_rng("mhatf", deterministic_seed) } else { set.seed(seed) } - rn <- function() matrix(stats::rnorm(d_model * d_model, 0, 1 / sqrt(d_model)), - d_model, d_model) + rn <- function() { + matrix( + stats::rnorm(d_model * d_model, 0, 1 / sqrt(d_model)), + d_model, d_model + ) + } if (is.null(W_q)) W_q <- rn() if (is.null(W_k)) W_k <- rn() if (is.null(W_v)) W_v <- rn() if (is.null(W_o)) W_o <- diag(d_model) - Q <- x %*% W_q; K <- x %*% W_k; V <- x %*% W_v + Q <- x %*% W_q + K <- x %*% W_k + V <- x %*% W_v head_outputs <- vector("list", num_heads) head_attns <- vector("list", num_heads) for (h in seq_len(num_heads)) { cols <- ((h - 1L) * d_k + 1L):(h * d_k) - res <- attnq_scaled_dot_product_attention(Q[, cols, drop = FALSE], - K[, cols, drop = FALSE], - V[, cols, drop = FALSE]) + res <- morie_attnq_scaled_dot_product_attention( + Q[, cols, drop = FALSE], + K[, cols, drop = FALSE], + V[, cols, drop = FALSE] + ) head_outputs[[h]] <- res$output head_attns[[h]] <- res$attn } concat <- do.call(cbind, head_outputs) out <- concat %*% W_o - list(output = out, estimate = out, heads = head_attns, - num_heads = as.integer(num_heads), - d_k = as.integer(d_k), d_model = as.integer(d_model), - method = "Multi-head attention") + list( + output = out, estimate = out, heads = head_attns, + num_heads = as.integer(num_heads), + d_k = as.integer(d_k), d_model = as.integer(d_model), + method = "Multi-head attention" + ) } -#' @rdname mhatf_multi_head_attention_full +#' @rdname morie_mhatf_multi_head_attention_full #' @keywords internal #' @export -multi_head_attention_full <- mhatf_multi_head_attention_full +morie_multi_head_attention_full <- morie_mhatf_multi_head_attention_full diff --git a/r-package/morie/R/midas.R b/r-package/morie/R/midas.R index 4d90cc1e7f..d1442c50a6 100644 --- a/r-package/morie/R/midas.R +++ b/r-package/morie/R/midas.R @@ -1,5 +1,23 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: MIDAS sum-of-squared-errors objective. Extracted from the +# morie_midas_regression() optimiser closure so the theta-domain guard and the +# non-finite-SSE guard are directly unit-testable. `X` is the lag-matrix, +# `Y` the target, `K` the number of high-frequency lags. +.midas_sse <- function(p, X, Y, K) { + b0 <- p[1] + b1 <- p[2] + t1 <- p[3] + t2 <- p[4] + if (t1 <= 0 || t2 <= 0) { + return(1e10) + } + w <- .morie_beta_weights(t1, t2, K) + yhat <- b0 + b1 * (X %*% w) + sse <- sum((Y - yhat)^2) + if (!is.finite(sse)) 1e10 else sse +} + #' MIDAS regression with Beta-polynomial weights #' #' @param x High-frequency regressor matrix (n_t x K) or flat vector. @@ -8,13 +26,12 @@ #' @return Named list with \code{beta0, beta1, theta1, theta2, weights, #' r2, n, K, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -midas_regression <- function(x, y, K = NULL) { - Y <- as.numeric(y); nT <- length(Y) +morie_midas_regression <- function(x, y, K = NULL) { + Y <- as.numeric(y) + nT <- length(Y) if (is.null(dim(x))) { if (is.null(K)) stop("Pass K when x is flat.") if (length(x) < K + nT - 1) stop("x too short.") @@ -26,28 +43,27 @@ midas_regression <- function(x, y, K = NULL) { } X <- do.call(rbind, rows) } else { - X <- as.matrix(x); K <- ncol(X) + X <- as.matrix(x) + K <- ncol(X) } if (nrow(X) != nT) stop("Dim mismatch.") if (nT < 4) stop("Need >=4 obs.") - neg_ll <- function(p) { - b0 <- p[1]; b1 <- p[2]; t1 <- p[3]; t2 <- p[4] - if (t1 <= 0 || t2 <= 0) return(1e10) - w <- .morie_beta_weights(t1, t2, K) - yhat <- b0 + b1 * (X %*% w) - sse <- sum((Y - yhat)^2) - if (!is.finite(sse)) 1e10 else sse - } + neg_ll <- function(p) .midas_sse(p, X, Y, K) opt <- nlminb(c(mean(Y), 1, 1.5, 2), neg_ll, - lower = c(-1e3, -1e3, 0.1, 0.1), - upper = c( 1e3, 1e3, 50, 50)) - b0 <- opt$par[1]; b1 <- opt$par[2] - t1 <- opt$par[3]; t2 <- opt$par[4] + lower = c(-1e3, -1e3, 0.1, 0.1), + upper = c(1e3, 1e3, 50, 50) + ) + b0 <- opt$par[1] + b1 <- opt$par[2] + t1 <- opt$par[3] + t2 <- opt$par[4] w <- .morie_beta_weights(t1, t2, K) resid <- Y - (b0 + b1 * (X %*% w)) ss_tot <- sum((Y - mean(Y))^2) r2 <- if (ss_tot > 0) 1 - sum(resid^2) / ss_tot else NA_real_ - list(beta0 = b0, beta1 = b1, theta1 = t1, theta2 = t2, - weights = w, r2 = r2, n = nT, K = K, - method = "MIDAS Beta-polynomial via nlminb (base R)") + list( + beta0 = b0, beta1 = b1, theta1 = t1, theta2 = t2, + weights = w, r2 = r2, n = nT, K = K, + method = "MIDAS Beta-polynomial via nlminb (base R)" + ) } diff --git a/r-package/morie/R/mnpbt.R b/r-package/morie/R/mnpbt.R index c51f03e298..0e14b0c2f1 100644 --- a/r-package/morie/R/mnpbt.R +++ b/r-package/morie/R/mnpbt.R @@ -12,33 +12,36 @@ #' @return Named list with `probs`, `max_alt`, `n_obs`, `n_alt`, #' `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' mnpbt(x = rnorm(50)) #' @export mnpbt <- function(x, n_draws = 2000L, seed = 0L) { U <- if (is.matrix(x)) x else matrix(as.numeric(x), nrow = 1L) - n <- nrow(U); J <- ncol(U) - if (J < 2L) - return(list(probs = matrix(1, n, J), - max_alt = rep(1L, n), n_obs = n, n_alt = J, - method = "multinomial_probit")) + n <- nrow(U) + J <- ncol(U) + if (J < 2L) { + return(list( + probs = matrix(1, n, J), + max_alt = rep(1L, n), n_obs = n, n_alt = J, + method = "multinomial_probit" + )) + } set.seed(seed) draws <- array(stats::rnorm(n_draws * n * J), dim = c(n_draws, n, J)) Y <- sweep(draws, c(2L, 3L), U, FUN = "+") - picks <- apply(Y, c(1L, 2L), which.max) # n_draws by n + picks <- apply(Y, c(1L, 2L), which.max) # n_draws by n probs <- matrix(0, n, J) for (j in seq_len(J)) probs[, j] <- colMeans(picks == j) if (J == 2L) { probs[, 2] <- stats::pnorm((U[, 2] - U[, 1]) / sqrt(2)) probs[, 1] <- 1 - probs[, 2] } - list(probs = probs, max_alt = max.col(probs), - n_obs = n, n_alt = J, method = "multinomial_probit") + list( + probs = probs, max_alt = max.col(probs), + n_obs = n, n_alt = J, method = "multinomial_probit" + ) } #' @keywords internal #' @rdname mnpbt #' @export -multinomial_probit_spatial <- mnpbt +morie_multinomial_probit_spatial <- mnpbt diff --git a/r-package/morie/R/modules.R b/r-package/morie/R/modules.R index 3844e8ba70..40d8f818d8 100644 --- a/r-package/morie/R/modules.R +++ b/r-package/morie/R/modules.R @@ -2,12 +2,9 @@ #' #' @return Data frame describing the implemented module surface. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_list_morie_modules() #' @export -list_morie_modules <- function() { +morie_list_morie_modules <- function() { data.frame( name = c( "data-wrangling", @@ -65,7 +62,9 @@ list_morie_modules <- function() { "data/datasets/oc/CPADS/2021-2022/cpads-2021-2022-pumf2.csv" ) for (p in candidates) { - if (file.exists(p)) return(p) + if (file.exists(p)) { + return(p) + } } # Fallback: first candidate (will be resolved by .resolve_cpads_csv). candidates[1L] @@ -95,19 +94,17 @@ list_morie_modules <- function() { #' @param data Raw CPADS data frame. #' @return Data frame with canonical MORIE analysis columns. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -canonicalize_cpads_data <- function(data) { +morie_canonicalize_cpads_data <- function(data) { required_raw <- c( "wtpumf", "alc05", "alc12_30d_prev_total", "alc12_30d_prev", "can05", "age_groups", "dvdemq01", "region", "hwbq01", "hwbq02", "ebac_tot", "ebac_legal" ) missing_raw <- setdiff(required_raw, names(data)) if (length(missing_raw) > 0) { - validate_cpads_data(data, strict = TRUE) + morie_validate_cpads_data(data, strict = TRUE) return(data) } @@ -136,7 +133,7 @@ canonicalize_cpads_data <- function(data) { for (nm in names(canonical)) { out[[nm]] <- canonical[[nm]] } - validate_cpads_data(out, strict = TRUE) + morie_validate_cpads_data(out, strict = TRUE) out } @@ -144,10 +141,18 @@ canonicalize_cpads_data <- function(data) { #' #' @param cpads_csv Path to the CPADS CSV. #' @return Canonicalized CPADS data frame. +#' @examples +#' \donttest{ +#' # Reads and canonicalises the CPADS PUMF CSV. The default CSV lives in +#' # a morie project tree; the CKAN-fetched PUMF works identically (see +#' # morie_load_dataset("ocp21")). The tryCatch guard lets the example +#' # render cleanly on machines without the CSV checked out locally. +#' tryCatch(morie_load_cpads_data(), error = function(e) message(conditionMessage(e))) +#' } #' @export -load_cpads_data <- function(cpads_csv = .cpads_default_csv()) { +morie_load_cpads_data <- function(cpads_csv = .cpads_default_csv()) { cpads_csv <- .resolve_cpads_csv(cpads_csv) - canonicalize_cpads_data(utils::read.csv(cpads_csv, stringsAsFactors = FALSE)) + morie_canonicalize_cpads_data(utils::read.csv(cpads_csv, stringsAsFactors = FALSE)) } .write_module_outputs <- function(outputs, output_dir = NULL) { @@ -175,16 +180,21 @@ load_cpads_data <- function(cpads_csv = .cpads_default_csv()) { #' @param output_dir Optional directory for CSV outputs. #' @return Named list of data-frame outputs. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") +#' \donttest{ +#' # Dispatch one MORIE module against the canonical CPADS CSV. The CSV +#' # ships with a morie project tree, or is fetched via the CKAN endpoint +#' # (morie_load_dataset("ocp21")). Wrapped in tryCatch so the example +#' # documents usage even when the CSV is not checked out locally. +#' tryCatch( +#' morie_run_morie_module("descriptive-statistics"), +#' error = function(e) message(conditionMessage(e)) +#' ) #' } #' @export -run_morie_module <- function(module_name, cpads_csv = .cpads_default_csv(), output_dir = NULL) { - data <- load_cpads_data(cpads_csv) +morie_run_morie_module <- function(module_name, cpads_csv = .cpads_default_csv(), output_dir = NULL) { + data <- morie_load_cpads_data(cpads_csv) - outputs <- switch( - module_name, + outputs <- switch(module_name, "data-wrangling" = .run_data_wrangling_module_internal(data, cpads_csv = cpads_csv, output_dir = output_dir), "descriptive-statistics" = .run_descriptive_statistics_module_internal(data), "distribution-tests" = .run_distribution_tests_module_internal(data), @@ -220,18 +230,16 @@ run_morie_module <- function(module_name, cpads_csv = .cpads_default_csv(), outp #' @param output_dir Optional directory for CSV outputs. #' @return Named list of module outputs. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -run_morie_modules <- function( - modules = list_morie_modules()$name, +morie_run_morie_modules <- function( + modules = morie_list_morie_modules()$name, cpads_csv = .cpads_default_csv(), output_dir = NULL ) { stats::setNames( - lapply(modules, function(m) run_morie_module(m, cpads_csv = cpads_csv, output_dir = output_dir)), + lapply(modules, function(m) morie_run_morie_module(m, cpads_csv = cpads_csv, output_dir = output_dir)), modules ) } diff --git a/r-package/morie/R/moeml.R b/r-package/morie/R/moeml.R index 4238904673..7d91417889 100644 --- a/r-package/morie/R/moeml.R +++ b/r-package/morie/R/moeml.R @@ -9,36 +9,48 @@ #' @return Named list with tensor, gate, topk_idx, load, method. #' @keywords internal mixture_of_experts <- function(x, W_gate = NULL, experts = NULL, - top_k = 2L) { + top_k = 2L) { xm <- as.matrix(x) - B <- nrow(xm); d_in <- ncol(xm) + B <- nrow(xm) + d_in <- ncol(xm) if (is.null(W_gate)) { - n_experts <- 2L; W_gate <- matrix(0, d_in, n_experts) + n_experts <- 2L + W_gate <- matrix(0, d_in, n_experts) } n_experts <- ncol(W_gate) if (is.null(experts)) { experts <- replicate(n_experts, - list(W = diag(d_in), b = rep(0, d_in)), - simplify = FALSE) + list(W = diag(d_in), b = rep(0, d_in)), + simplify = FALSE + ) } gate_logits <- xm %*% W_gate gate <- t(apply(gate_logits, 1L, function(v) { - v <- v - max(v); e <- exp(v); e / sum(e) + v <- v - max(v) + e <- exp(v) + e / sum(e) })) k <- max(1L, min(as.integer(top_k), n_experts)) - topk_idx <- t(apply(gate, 1L, function(g) - order(-g)[seq_len(k)])) + topk_idx <- matrix(0L, B, k) + for (b in seq_len(B)) { + topk_idx[b, ] <- order(-gate[b, ])[seq_len(k)] + } sparse <- matrix(0, B, n_experts) - for (b in seq_len(B)) + for (b in seq_len(B)) { sparse[b, topk_idx[b, ]] <- gate[b, topk_idx[b, ]] + } sparse <- sparse / rowSums(sparse) - expert_outs <- lapply(experts, function(e) - sweep(xm %*% e$W, 2L, e$b, "+")) + expert_outs <- lapply(experts, function(e) { + sweep(xm %*% e$W, 2L, e$b, "+") + }) d_out <- ncol(expert_outs[[1L]]) y <- matrix(0, B, d_out) - for (e in seq_len(n_experts)) + for (e in seq_len(n_experts)) { y <- y + sweep(expert_outs[[e]], 1L, sparse[, e], "*") + } load <- colSums(sparse) / B - list(tensor = y, gate = sparse, - topk_idx = topk_idx - 1L, load = load, method = "MoE") + list( + tensor = y, gate = sparse, + topk_idx = topk_idx - 1L, load = load, method = "MoE" + ) } diff --git a/r-package/morie/R/morie-package.R b/r-package/morie/R/morie-package.R index f52eb50bf7..73c4e79f7c 100644 --- a/r-package/morie/R/morie-package.R +++ b/r-package/morie/R/morie-package.R @@ -60,7 +60,9 @@ #' \emph{Solitary Confinement, Torture, and Canada's Structured #' Intervention Units.} Centre for Criminology and Sociolegal #' Studies, University of Toronto. -#' \url{https://www.crimsl.utoronto.ca/sites/www.crimsl.utoronto.ca/files/TortureSolitarySIUsSprottDoob23Feb2021_0.pdf} +#' Available at the Centre for Criminology and Sociolegal +#' Studies web site: crimsl.utoronto.ca (file +#' TortureSolitarySIUsSprottDoob23Feb2021_0.pdf). #' \item Doob, A. N. and Sprott, J. B. (2020). #' \emph{Understanding the Operation of Correctional Service #' Canada's Structured Intervention Units: Some Preliminary @@ -91,6 +93,8 @@ #' @aliases morie-package #' @importFrom stats aggregate ave deviance median na.omit plogis #' setNames update weighted.mean +#' @importFrom Rcpp sourceCpp +#' @useDynLib morie, .registration = TRUE "_PACKAGE" # `weight` is referenced as a data-frame column name inside diff --git a/r-package/morie/R/mrkvr.R b/r-package/morie/R/mrkvr.R index 5e6317d0c6..744f823724 100644 --- a/r-package/morie/R/mrkvr.R +++ b/r-package/morie/R/mrkvr.R @@ -13,43 +13,57 @@ #' sigma_m2_naive, sum_2pq, p_freq, n, p, method). #' @references VanRaden (2008); Montesinos Lopez Ch 3. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_marker_variance( +#' x = rnorm(50), y = rnorm(50), +#' markers = matrix(sample(0:2, 200, TRUE), 50, 4) +#' ) #' @export -marker_variance <- function(x, y, markers) { - y <- as.numeric(y); n <- length(y) - M <- as.matrix(markers); m <- ncol(M) - cand <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) matrix(1, n, 1) - else cbind(1, as.matrix(x)) - qrx <- qr(cand); X <- cand[, qrx$pivot[seq_len(qrx$rank)], drop = FALSE] +morie_marker_variance <- function(x, y, markers) { + y <- as.numeric(y) + n <- length(y) + M <- as.matrix(markers) + m <- ncol(M) + cand <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) { + matrix(1, n, 1) + } else { + cbind(1, as.matrix(x)) + } + qrx <- qr(cand) + X <- cand[, qrx$pivot[seq_len(qrx$rank)], drop = FALSE] p_freq <- colMeans(M) / 2 sum_2pq <- 2 * sum(p_freq * (1 - p_freq)) if (sum_2pq <= 0) sum_2pq <- 1 - G <- grm_vanraden(M, method = 1)$estimate + 1e-6 * diag(n) + G <- morie_grm_vanraden(M, method = 1)$estimate + 1e-6 * diag(n) Ginv <- solve(G) lam <- 1 px <- ncol(X) - C <- rbind(cbind(crossprod(X), t(X)), - cbind(X, diag(n) + lam * Ginv)) + C <- rbind( + cbind(crossprod(X), t(X)), + cbind(X, diag(n) + lam * Ginv) + ) rhs <- c(crossprod(X, y), y) sol <- solve(C, rhs) - beta <- sol[seq_len(px)]; g_hat <- sol[(px + 1):(px + n)] + beta <- sol[seq_len(px)] + g_hat <- sol[(px + 1):(px + n)] sigma_g2 <- if (n > 1) stats::var(g_hat) else 0 resid <- y - X %*% beta - g_hat sigma_e2 <- if (n > 1) stats::var(as.numeric(resid)) else 0 sigma_m2_v <- sigma_g2 / sum_2pq sigma_m2_n <- sigma_g2 / max(m, 1) - h2 <- if (sigma_g2 + sigma_e2 > 0) sigma_g2 / (sigma_g2 + sigma_e2) - else NA_real_ - list(estimate = sigma_m2_v, sigma_g2 = sigma_g2, sigma_e2 = sigma_e2, - h2 = h2, sigma_m2_vanraden = sigma_m2_v, sigma_m2_naive = sigma_m2_n, - sum_2pq = sum_2pq, p_freq = p_freq, n = n, p = m, - method = "VanRaden + naive marker-variance split") + h2 <- if (sigma_g2 + sigma_e2 > 0) { + sigma_g2 / (sigma_g2 + sigma_e2) + } else { + NA_real_ + } + list( + estimate = sigma_m2_v, sigma_g2 = sigma_g2, sigma_e2 = sigma_e2, + h2 = h2, sigma_m2_vanraden = sigma_m2_v, sigma_m2_naive = sigma_m2_n, + sum_2pq = sum_2pq, p_freq = p_freq, n = n, p = m, + method = "VanRaden + naive marker-variance split" + ) } # CANONICAL TEST # set.seed(16); M <- matrix(sample(0:2, 160, TRUE), 20, 8) # y <- M %*% rnorm(8) + 0.5*rnorm(20) -# marker_variance(rep(0, 20), y, M)$h2 +# morie_marker_variance(rep(0, 20), y, M)$h2 diff --git a/r-package/morie/R/mrm_design.R b/r-package/morie/R/mrm_design.R index 161f320347..2f7bbf5b68 100644 --- a/r-package/morie/R/mrm_design.R +++ b/r-package/morie/R/mrm_design.R @@ -13,6 +13,13 @@ #' Box, G. E. P., Hunter, J. S., & Hunter, W. G. (2005). #' Statistics for Experimenters. Wiley. #' +#' @return Each design callable returns a named \code{list} of estimates, +#' test statistics, p-values, and a plain-language \code{interpretation}. +#' @examples +#' set.seed(2026) +#' a <- rnorm(40, mean = 5, sd = 1.2) +#' b <- rnorm(40, mean = 5.5, sd = 1.5) +#' mrm_two_treatment_test(a, b)$p_welch #' @name mrm_design NULL @@ -31,37 +38,41 @@ NULL #' n_a, n_b, interpretation. #' @examples #' set.seed(2026) -#' a <- rnorm(40, mean = 5, sd = 1.2) +#' a <- rnorm(40, mean = 5, sd = 1.2) #' b <- rnorm(40, mean = 5.5, sd = 1.5) #' res <- mrm_two_treatment_test(a, b) -#' res$estimate # mean(a) - mean(b) -#' res$p_welch # canonical p-value +#' res$estimate # mean(a) - mean(b) +#' res$p_welch # canonical p-value #' res$p_mannwhitney # rank-based sensitivity check #' @export mrm_two_treatment_test <- function(a, b, alpha = 0.05) { - a <- as.numeric(a); b <- as.numeric(b) - a <- a[is.finite(a)]; b <- b[is.finite(b)] + a <- as.numeric(a) + b <- as.numeric(b) + a <- a[is.finite(a)] + b <- b[is.finite(b)] welch <- stats::t.test(a, b, var.equal = FALSE) - stud <- stats::t.test(a, b, var.equal = TRUE) - mw <- suppressWarnings(stats::wilcox.test(a, b, alternative = "two.sided")) - diff <- mean(a) - mean(b) - se <- sqrt(stats::var(a) / length(a) + stats::var(b) / length(b)) - df_ <- welch$parameter - z <- stats::qt(1 - alpha / 2, df_) + stud <- stats::t.test(a, b, var.equal = TRUE) + mw <- suppressWarnings(stats::wilcox.test(a, b, alternative = "two.sided")) + diff <- mean(a) - mean(b) + se <- sqrt(stats::var(a) / length(a) + stats::var(b) / length(b)) + df_ <- welch$parameter + z <- stats::qt(1 - alpha / 2, df_) list( - estimate = round(diff, 6), - se = round(se, 6), - t_statistic = round(as.numeric(welch$statistic), 4), - df = round(as.numeric(df_), 2), - p_welch = welch$p.value, - p_student = stud$p.value, - p_mannwhitney= mw$p.value, - ci_lower = round(diff - z * se, 6), - ci_upper = round(diff + z * se, 6), - n_a = length(a), n_b = length(b), - interpretation = sprintf("Welch t: Delta=%.3f, p=%.3g; %s", - diff, welch$p.value, - if (welch$p.value < alpha) "reject H0" else "fail to reject") + estimate = round(diff, 6), + se = round(se, 6), + t_statistic = round(as.numeric(welch$statistic), 4), + df = round(as.numeric(df_), 2), + p_welch = welch$p.value, + p_student = stud$p.value, + p_mannwhitney = mw$p.value, + ci_lower = round(diff - z * se, 6), + ci_upper = round(diff + z * se, 6), + n_a = length(a), n_b = length(b), + interpretation = sprintf( + "Welch t: Delta=%.3f, p=%.3g; %s", + diff, welch$p.value, + if (welch$p.value < alpha) "reject H0" else "fail to reject" + ) ) } @@ -92,23 +103,27 @@ mrm_anova_oneway <- function(data, response_col, group_col, alpha = 0.05) { d[[group_col]] <- factor(d[[group_col]]) fit <- stats::aov(as.formula(paste(response_col, "~", group_col)), data = d) summ <- summary(fit)[[1]] - f <- summ[["F value"]][1]; p <- summ[["Pr(>F)"]][1] + f <- summ[["F value"]][1] + p <- summ[["Pr(>F)"]][1] tk <- stats::TukeyHSD(fit, conf.level = 1 - alpha) tk_df <- as.data.frame(tk[[group_col]]) - tk_df$pair <- rownames(tk_df); rownames(tk_df) <- NULL + tk_df$pair <- rownames(tk_df) + rownames(tk_df) <- NULL means <- tapply(d[[response_col]], d[[group_col]], mean) - ns <- tapply(d[[response_col]], d[[group_col]], length) + ns <- tapply(d[[response_col]], d[[group_col]], length) list( f_statistic = round(as.numeric(f), 4), - p_value = as.numeric(p), - df_between = summ[["Df"]][1], - df_within = summ[["Df"]][2], - means = as.list(round(means, 4)), + p_value = as.numeric(p), + df_between = summ[["Df"]][1], + df_within = summ[["Df"]][2], + means = as.list(round(means, 4)), n_per_group = as.list(ns), - tukey_hsd = tk_df, - interpretation = sprintf("F(%d,%d) = %.3f, p = %.3g%s", - summ[["Df"]][1], summ[["Df"]][2], f, p, - if (p < alpha) "; reject H0 of equal means" else "") + tukey_hsd = tk_df, + interpretation = sprintf( + "F(%d,%d) = %.3f, p = %.3g%s", + summ[["Df"]][1], summ[["Df"]][2], f, p, + if (p < alpha) "; reject H0 of equal means" else "" + ) ) } @@ -133,8 +148,10 @@ mrm_anova_oneway <- function(data, response_col, group_col, alpha = 0.05) { #' lvl <- c(-1, 1) #' df <- expand.grid(A = lvl, B = lvl, C = lvl) #' df$y <- 10 + 2 * df$A + 1.5 * df$B + 0.5 * df$A * df$B + rnorm(8, 0, 0.2) -#' res <- mrm_factorial_2k(df, response_col = "y", -#' factor_cols = c("A", "B", "C")) +#' res <- mrm_factorial_2k(df, +#' response_col = "y", +#' factor_cols = c("A", "B", "C") +#' ) #' res$main_effects #' res$interaction_effects #' @export @@ -184,10 +201,12 @@ mrm_factorial_2k <- function(data, response_col, factor_cols) { interaction_effects = lapply(inter, function(x) round(x, 6)), half_normal_coords = half_n, n = length(y), k = k, - interpretation = sprintf("2^%d factorial on n=%d. Largest main effect: %s = %.3f", - k, length(y), - names(main)[which.max(abs(main))], - main[which.max(abs(main))]) + interpretation = sprintf( + "2^%d factorial on n=%d. Largest main effect: %s = %.3f", + k, length(y), + names(main)[which.max(abs(main))], + main[which.max(abs(main))] + ) ) } @@ -212,14 +231,18 @@ mrm_factorial_2k <- function(data, response_col, factor_cols) { #' y <- 0.7 * D + 0.3 * x + rnorm(n, 0, 0.5) #' df <- data.frame(D = D, y = y, age = x) #' # IPW-adjusted ATE -#' ipw <- mrm_causal_design(df, treatment_col = "D", -#' outcome_col = "y", -#' covariates = "age", -#' estimator = "ipw") +#' ipw <- mrm_causal_design(df, +#' treatment_col = "D", +#' outcome_col = "y", +#' covariates = "age", +#' estimator = "ipw" +#' ) #' # Naive difference in means for comparison -#' raw <- mrm_causal_design(df, treatment_col = "D", -#' outcome_col = "y", -#' estimator = "diff_in_means") +#' raw <- mrm_causal_design(df, +#' treatment_col = "D", +#' outcome_col = "y", +#' estimator = "diff_in_means" +#' ) #' c(ipw = ipw$estimate, raw = raw$estimate) #' @importFrom stats as.formula #' @export @@ -234,35 +257,44 @@ mrm_causal_design <- function( d <- d[stats::complete.cases(d), , drop = FALSE] D <- as.integer(d[[treatment_col]]) Y <- as.numeric(d[[outcome_col]]) - n <- nrow(d); n_t <- sum(D) + n <- nrow(d) + n_t <- sum(D) if (estimator == "ipw" && length(covariates) > 0L) { fml <- as.formula(paste(treatment_col, "~", paste(covariates, collapse = "+"))) fit <- stats::glm(fml, data = d, family = stats::binomial()) - e <- stats::predict(fit, type = "response") - e <- pmax(pmin(e, 1 - 1e-6), 1e-6) - w1 <- D / e; w0 <- (1 - D) / (1 - e) + e <- stats::predict(fit, type = "response") + e <- pmax(pmin(e, 1 - 1e-6), 1e-6) + w1 <- D / e + w0 <- (1 - D) / (1 - e) tau <- sum(w1 * Y) / sum(w1) - sum(w0 * Y) / sum(w0) # bootstrap SE set.seed(42) boots <- replicate(199, { idx <- sample.int(n, replace = TRUE) sub <- d[idx, , drop = FALSE] - e_b <- tryCatch({ - fit_b <- stats::glm(fml, data = sub, family = stats::binomial()) - pmax(pmin(stats::predict(fit_b, type = "response"), 1 - 1e-6), 1e-6) - }, error = function(e) NA_real_) - if (all(is.na(e_b))) NA_real_ else { - Db <- as.integer(sub[[treatment_col]]); Yb <- as.numeric(sub[[outcome_col]]) - w1b <- Db / e_b; w0b <- (1 - Db) / (1 - e_b) + e_b <- tryCatch( + { + fit_b <- stats::glm(fml, data = sub, family = stats::binomial()) + pmax(pmin(stats::predict(fit_b, type = "response"), 1 - 1e-6), 1e-6) + }, + error = function(e) NA_real_ + ) + if (all(is.na(e_b))) { + NA_real_ + } else { + Db <- as.integer(sub[[treatment_col]]) + Yb <- as.numeric(sub[[outcome_col]]) + w1b <- Db / e_b + w0b <- (1 - Db) / (1 - e_b) sum(w1b * Yb) / sum(w1b) - sum(w0b * Yb) / sum(w0b) } }) se <- stats::sd(boots, na.rm = TRUE) } else { tau <- mean(Y[D == 1]) - mean(Y[D == 0]) - se <- sqrt(stats::var(Y[D == 1]) / sum(D == 1) + - stats::var(Y[D == 0]) / sum(D == 0)) + se <- sqrt(stats::var(Y[D == 1]) / sum(D == 1) + + stats::var(Y[D == 0]) / sum(D == 0)) } z <- 1.96 @@ -271,10 +303,12 @@ mrm_causal_design <- function( estimate = round(tau, 6), se = round(se, 6), ci_lower = round(tau - z * se, 6), ci_upper = round(tau + z * se, 6), - p_value = 2 * (1 - stats::pnorm(abs(tau / se))), + p_value = 2 * (1 - stats::pnorm(abs(tau / se))), n = n, n_treated = as.integer(n_t), - interpretation = sprintf("%s ATE = %.4f (SE %.4f); 95%% CI [%.4f, %.4f]", - toupper(estimator), tau, se, - tau - z * se, tau + z * se) + interpretation = sprintf( + "%s ATE = %.4f (SE %.4f); 95%% CI [%.4f, %.4f]", + toupper(estimator), tau, se, + tau - z * se, tau + z * se + ) ) } diff --git a/r-package/morie/R/mrm_diagnostics.R b/r-package/morie/R/mrm_diagnostics.R index 043a17192f..0eae52dec6 100644 --- a/r-package/morie/R/mrm_diagnostics.R +++ b/r-package/morie/R/mrm_diagnostics.R @@ -14,6 +14,20 @@ #' Cole, S. R., & Hernan, M. A. (2008). Constructing inverse probability #' weights for marginal structural models. AJE, 168(6), 656-664. #' +#' @return Each diagnostic callable returns a named \code{list} of balance +#' and overlap statistics (or the estimated effect) together with a +#' plain-language \code{interpretation}. +#' @examples +#' set.seed(2026) +#' n <- 200L +#' df <- data.frame( +#' D = rbinom(n, 1, 0.4), +#' age = rnorm(n, 50, 10), bmi = rnorm(n, 27, 4) +#' ) +#' mrm_standardised_difference(df, +#' treatment_col = "D", +#' covariates = c("age", "bmi") +#' ) #' @name mrm_diagnostics NULL @@ -45,26 +59,31 @@ NULL #' age = rnorm(n, 50, 10), #' bmi = rnorm(n, 27, 4) #' ) -#' df$age[df$D == 1] <- df$age[df$D == 1] + 3 # deliberate imbalance -#' mrm_standardised_difference(df, treatment_col = "D", -#' covariates = c("age", "bmi")) +#' df$age[df$D == 1] <- df$age[df$D == 1] + 3 # deliberate imbalance +#' mrm_standardised_difference(df, +#' treatment_col = "D", +#' covariates = c("age", "bmi") +#' ) #' @export mrm_standardised_difference <- function(data, treatment_col, covariates) { D <- as.integer(data[[treatment_col]]) out <- lapply(covariates, function(c) { - x <- as.numeric(data[[c]]) - x_t <- x[D == 1]; x_c <- x[D == 0] - m_t <- mean(x_t); m_c <- mean(x_c) - s_t <- stats::var(x_t); s_c <- stats::var(x_c) + x <- as.numeric(data[[c]]) + x_t <- x[D == 1] + x_c <- x[D == 0] + m_t <- mean(x_t) + m_c <- mean(x_c) + s_t <- stats::var(x_t) + s_c <- stats::var(x_c) pooled_sd <- sqrt((s_t + s_c) / 2) smd_pct <- if (pooled_sd > 0) 100 * (m_t - m_c) / pooled_sd else NA_real_ data.frame( - covariate = c, + covariate = c, mean_treated = round(m_t, 4), mean_control = round(m_c, 4), - pooled_sd = round(pooled_sd, 4), - smd_pct = round(smd_pct, 2), - imbalanced = if (is.na(smd_pct)) NA else abs(smd_pct) > 10, + pooled_sd = round(pooled_sd, 4), + smd_pct = round(smd_pct, 2), + imbalanced = if (is.na(smd_pct)) NA else abs(smd_pct) > 10, stringsAsFactors = FALSE ) }) @@ -90,14 +109,16 @@ mrm_standardised_difference <- function(data, treatment_col, covariates) { #' age = rnorm(n, 50, 10), #' bmi = rnorm(n, 27, 4) #' ) -#' df$age[df$D == 1] <- df$age[df$D == 1] + 3 # imbalance on age -#' bal <- mrm_check_balancing(df, treatment_col = "D", -#' covariates = c("age", "bmi")) +#' df$age[df$D == 1] <- df$age[df$D == 1] + 3 # imbalance on age +#' bal <- mrm_check_balancing(df, +#' treatment_col = "D", +#' covariates = c("age", "bmi") +#' ) #' bal$overall_balanced #' bal$interpretation #' @export mrm_check_balancing <- function(data, treatment_col, covariates, - threshold_pct = 10) { + threshold_pct = 10) { tbl <- mrm_standardised_difference(data, treatment_col, covariates) n_imb <- sum(tbl$imbalanced, na.rm = TRUE) overall <- n_imb == 0L @@ -129,8 +150,10 @@ mrm_check_balancing <- function(data, treatment_col, covariates, #' x <- rnorm(n) #' D <- rbinom(n, 1, plogis(0.5 * x)) #' df <- data.frame(D = D, age = x) -#' ovl <- mrm_check_overlap(df, treatment_col = "D", -#' covariates = "age") +#' ovl <- mrm_check_overlap(df, +#' treatment_col = "D", +#' covariates = "age" +#' ) #' ovl$positivity_violations #' ovl$interpretation #' @export @@ -138,18 +161,23 @@ mrm_check_overlap <- function(data, treatment_col, covariates) { D <- as.integer(data[[treatment_col]]) X <- as.data.frame(data[, covariates, drop = FALSE]) e <- .morie_logistic_propensity(D, X) - e_t <- e[D == 1]; e_c <- e[D == 0] - qs <- c(0.025, 0.25, 0.5, 0.75, 0.975) + e_t <- e[D == 1] + e_c <- e[D == 0] + qs <- c(0.025, 0.25, 0.5, 0.75, 0.975) cs_lo <- max(min(e_t), min(e_c)) cs_hi <- min(max(e_t), max(e_c)) n_outside <- sum(e < cs_lo | e > cs_hi) pviol <- sum(e < 0.01 | e > 0.99) name_q <- function(q) paste0("q", q * 100) list( - e_treated_quantiles = setNames(as.list(round(stats::quantile(e_t, qs), 4)), - name_q(qs)), - e_control_quantiles = setNames(as.list(round(stats::quantile(e_c, qs), 4)), - name_q(qs)), + e_treated_quantiles = setNames( + as.list(round(stats::quantile(e_t, qs), 4)), + name_q(qs) + ), + e_control_quantiles = setNames( + as.list(round(stats::quantile(e_c, qs), 4)), + name_q(qs) + ), common_support_lower = round(cs_lo, 4), common_support_upper = round(cs_hi, 4), n_outside_support = as.integer(n_outside), @@ -177,14 +205,16 @@ mrm_check_overlap <- function(data, treatment_col, covariates) { #' D <- rbinom(n, 1, plogis(0.5 * x)) #' y <- 0.7 * D + 0.3 * x + rnorm(n, 0, 0.5) #' df <- data.frame(D = D, y = y, age = x) -#' res <- mrm_median_causal_effect(df, treatment_col = "D", -#' outcome_col = "y", -#' covariates = "age") +#' res <- mrm_median_causal_effect(df, +#' treatment_col = "D", +#' outcome_col = "y", +#' covariates = "age" +#' ) #' res$median_treatment_effect #' res$n_matched #' @export mrm_median_causal_effect <- function(data, treatment_col, outcome_col, - covariates) { + covariates) { D <- as.integer(data[[treatment_col]]) Y <- as.numeric(data[[outcome_col]]) X <- as.data.frame(data[, covariates, drop = FALSE]) @@ -200,14 +230,16 @@ mrm_median_causal_effect <- function(data, treatment_col, outcome_col, if (!length(avail)) break dists <- abs(logit[avail] - logit[i]) j <- avail[which.min(dists)] - used <- c(used, j); k <- k + 1L + used <- c(used, j) + k <- k + 1L pairs[[k]] <- c(i, j) } pairs <- pairs[seq_len(k)] if (!length(pairs)) stop("no valid matches") Y1 <- vapply(pairs, function(p) Y[p[1]], numeric(1)) Y0 <- vapply(pairs, function(p) Y[p[2]], numeric(1)) - m1 <- stats::median(Y1); m0 <- stats::median(Y0) + m1 <- stats::median(Y1) + m0 <- stats::median(Y0) list( median_y1 = round(m1, 4), median_y0 = round(m0, 4), @@ -239,13 +271,15 @@ mrm_median_causal_effect <- function(data, treatment_col, outcome_col, #' D <- rbinom(n, 1, plogis(0.5 * x)) #' y <- 0.7 * D + 0.3 * x + rnorm(n) #' df <- data.frame(D = D, y = y, age = x) -#' chk <- mrm_assumptions_check(df, treatment_col = "D", -#' outcome_col = "y", -#' covariates = "age") +#' chk <- mrm_assumptions_check(df, +#' treatment_col = "D", +#' outcome_col = "y", +#' covariates = "age" +#' ) #' chk$overall_verdict #' @export mrm_assumptions_check <- function(data, treatment_col, outcome_col, - covariates) { + covariates) { overlap <- mrm_check_overlap(data, treatment_col, covariates) balance <- mrm_check_balancing(data, treatment_col, covariates) sutva <- list( @@ -273,7 +307,7 @@ mrm_assumptions_check <- function(data, treatment_col, outcome_col, ) ) overall <- if (balance$overall_balanced && - overlap$positivity_violations == 0L) { + overlap$positivity_violations == 0L) { "all three assumptions ok modulo SUTVA design-context" } else { "one or more diagnostic flags; see fields" diff --git a/r-package/morie/R/mrm_doe.R b/r-package/morie/R/mrm_doe.R index ff30320ad9..3982bcd07c 100644 --- a/r-package/morie/R/mrm_doe.R +++ b/r-package/morie/R/mrm_doe.R @@ -14,6 +14,17 @@ #' optimum conditions. JRSS-B, 13(1), 1-45. #' Cohen, J. (1988). Statistical Power Analysis for the Behavioral Sciences. #' +#' @return Each design-of-experiments callable returns a named \code{list} +#' holding the constructed design or the analysis result and a +#' plain-language \code{interpretation}. +#' @examples +#' set.seed(2026) +#' n <- 30L +#' df <- data.frame( +#' y = c(rnorm(n, 0), rnorm(n, 0.5), rnorm(n, 1)), +#' g = rep(c("A", "B", "C"), each = n) +#' ) +#' mrm_anova_bonferroni(df, response_col = "y", group_col = "g")$alpha_per_pair #' @name mrm_doe NULL @@ -34,17 +45,19 @@ NULL #' g = rep(c("A", "B", "C"), each = n) #' ) #' res <- mrm_anova_bonferroni(df, response_col = "y", group_col = "g") -#' res$alpha_per_pair # Bonferroni-corrected per-pair alpha -#' res$pairs # per-pair t-tests with adjusted significance flags +#' res$alpha_per_pair # Bonferroni-corrected per-pair alpha +#' res$pairs # per-pair t-tests with adjusted significance flags #' @export mrm_anova_bonferroni <- function(data, response_col, group_col, alpha = 0.05) { d <- data[, c(response_col, group_col)] d <- d[stats::complete.cases(d), , drop = FALSE] d[[group_col]] <- factor(d[[group_col]]) fit <- stats::aov(stats::as.formula(paste(response_col, "~", group_col)), - data = d) + data = d + ) summ <- summary(fit)[[1]] - f <- summ[["F value"]][1]; p_anova <- summ[["Pr(>F)"]][1] + f <- summ[["F value"]][1] + p_anova <- summ[["Pr(>F)"]][1] groups <- levels(d[[group_col]]) pairs_list <- list() k <- 0L @@ -95,25 +108,33 @@ mrm_anova_bonferroni <- function(data, response_col, group_col, alpha = 0.05) { #' n_blocks, interpretation. #' @examples #' set.seed(2026) -#' df <- expand.grid(treatment = c("A", "B", "C"), -#' block = c("B1", "B2", "B3", "B4")) +#' df <- expand.grid( +#' treatment = c("A", "B", "C"), +#' block = c("B1", "B2", "B3", "B4") +#' ) #' # Treatment effect + block effect + noise #' df$y <- as.numeric(df$treatment) * 2 + #' as.numeric(df$block) * 0.5 + rnorm(nrow(df), 0, 0.3) -#' res <- mrm_rcbd(df, response_col = "y", -#' treatment_col = "treatment", block_col = "block") +#' res <- mrm_rcbd(df, +#' response_col = "y", +#' treatment_col = "treatment", block_col = "block" +#' ) #' res$anova #' @export mrm_rcbd <- function(data, response_col, treatment_col, block_col) { d <- data[, c(response_col, treatment_col, block_col)] d <- d[stats::complete.cases(d), , drop = FALSE] d[[treatment_col]] <- factor(d[[treatment_col]]) - d[[block_col]] <- factor(d[[block_col]]) - fit <- stats::aov(stats::as.formula(paste( - response_col, "~", block_col, "+", treatment_col)), - data = d) + d[[block_col]] <- factor(d[[block_col]]) + fit <- stats::aov( + stats::as.formula(paste( + response_col, "~", block_col, "+", treatment_col + )), + data = d + ) summ <- as.data.frame(summary(fit)[[1]]) - summ$source <- trimws(rownames(summ)); rownames(summ) <- NULL + summ$source <- trimws(rownames(summ)) + rownames(summ) <- NULL list( anova = summ, n = nrow(d), @@ -142,21 +163,27 @@ mrm_rcbd <- function(data, response_col, treatment_col, block_col) { #' df$treatment <- LETTERS[as.integer(as.vector(sq)) + 1L] #' set.seed(2026) #' df$y <- match(df$treatment, LETTERS) * 1.5 + rnorm(16, 0, 0.4) -#' res <- mrm_latin_square(df, response_col = "y", -#' row_col = "row", col_col = "col", -#' treatment_col = "treatment") +#' res <- mrm_latin_square(df, +#' response_col = "y", +#' row_col = "row", col_col = "col", +#' treatment_col = "treatment" +#' ) #' res$anova #' @export mrm_latin_square <- function(data, response_col, row_col, col_col, - treatment_col) { + treatment_col) { d <- data[, c(response_col, row_col, col_col, treatment_col)] d <- d[stats::complete.cases(d), , drop = FALSE] for (c in c(row_col, col_col, treatment_col)) d[[c]] <- factor(d[[c]]) - fit <- stats::aov(stats::as.formula(paste( - response_col, "~", row_col, "+", col_col, "+", treatment_col)), - data = d) + fit <- stats::aov( + stats::as.formula(paste( + response_col, "~", row_col, "+", col_col, "+", treatment_col + )), + data = d + ) summ <- as.data.frame(summary(fit)[[1]]) - summ$source <- trimws(rownames(summ)); rownames(summ) <- NULL + summ$source <- trimws(rownames(summ)) + rownames(summ) <- NULL list( anova = summ, n = nrow(d), @@ -178,36 +205,46 @@ mrm_latin_square <- function(data, response_col, row_col, col_col, #' @examples #' # Hardcoded 4 x 4 orthogonal Graeco-Latin square (two random Latin #' # squares are generally NOT orthogonal, so we use a known pair): -#' L <- matrix(c("A","B","C","D", -#' "B","A","D","C", -#' "C","D","A","B", -#' "D","C","B","A"), nrow = 4L, byrow = TRUE) -#' G <- matrix(c("a","b","c","d", -#' "c","d","a","b", -#' "d","c","b","a", -#' "b","a","d","c"), nrow = 4L, byrow = TRUE) +#' L <- matrix(c( +#' "A", "B", "C", "D", +#' "B", "A", "D", "C", +#' "C", "D", "A", "B", +#' "D", "C", "B", "A" +#' ), nrow = 4L, byrow = TRUE) +#' G <- matrix(c( +#' "a", "b", "c", "d", +#' "c", "d", "a", "b", +#' "d", "c", "b", "a", +#' "b", "a", "d", "c" +#' ), nrow = 4L, byrow = TRUE) #' set.seed(2026) #' df <- expand.grid(row = paste0("R", 1:4), col = paste0("C", 1:4)) #' df$latin <- as.vector(L) #' df$greek <- as.vector(G) #' df$y <- match(df$latin, LETTERS) * 1.2 + #' match(df$greek, letters) * 0.5 + rnorm(16, 0, 0.3) -#' res <- mrm_graeco_latin(df, response_col = "y", -#' row_col = "row", col_col = "col", -#' latin_col = "latin", greek_col = "greek") +#' res <- mrm_graeco_latin(df, +#' response_col = "y", +#' row_col = "row", col_col = "col", +#' latin_col = "latin", greek_col = "greek" +#' ) #' res$anova #' @export mrm_graeco_latin <- function(data, response_col, row_col, col_col, - latin_col, greek_col) { + latin_col, greek_col) { d <- data[, c(response_col, row_col, col_col, latin_col, greek_col)] d <- d[stats::complete.cases(d), , drop = FALSE] for (c in c(row_col, col_col, latin_col, greek_col)) d[[c]] <- factor(d[[c]]) - fit <- stats::aov(stats::as.formula(paste( - response_col, "~", row_col, "+", col_col, "+", - latin_col, "+", greek_col)), - data = d) + fit <- stats::aov( + stats::as.formula(paste( + response_col, "~", row_col, "+", col_col, "+", + latin_col, "+", greek_col + )), + data = d + ) summ <- as.data.frame(summary(fit)[[1]]) - summ$source <- trimws(rownames(summ)); rownames(summ) <- NULL + summ$source <- trimws(rownames(summ)) + rownames(summ) <- NULL list( anova = summ, n = nrow(d), @@ -241,12 +278,14 @@ mrm_graeco_latin <- function(data, response_col, row_col, col_col, #' C = c(1, -1, -1, 1) #' ) #' df$y <- 5 + 2 * df$A + 1.5 * df$B + rnorm(4, 0, 0.3) -#' res <- mrm_fractional_factorial(df, response_col = "y", -#' factor_cols = c("A", "B", "C")) +#' res <- mrm_fractional_factorial(df, +#' response_col = "y", +#' factor_cols = c("A", "B", "C") +#' ) #' res$main_effects #' @export mrm_fractional_factorial <- function(data, response_col, factor_cols, - generator = NULL) { + generator = NULL) { d <- data[, c(response_col, factor_cols), drop = FALSE] d <- d[stats::complete.cases(d), , drop = FALSE] y <- d[[response_col]] @@ -287,12 +326,16 @@ mrm_fractional_factorial <- function(data, response_col, factor_cols, #' @examples #' # Central composite design on (x1, x2) with quadratic response. #' set.seed(2026) -#' df <- expand.grid(x1 = c(-1.4, -1, 0, 1, 1.4), -#' x2 = c(-1.4, -1, 0, 1, 1.4)) +#' df <- expand.grid( +#' x1 = c(-1.4, -1, 0, 1, 1.4), +#' x2 = c(-1.4, -1, 0, 1, 1.4) +#' ) #' df$y <- 10 + 2 * df$x1 + 1.5 * df$x2 - #' df$x1^2 - 1.2 * df$x2^2 + rnorm(nrow(df), 0, 0.2) -#' res <- mrm_response_surface(df, response_col = "y", -#' factor_cols = c("x1", "x2")) +#' res <- mrm_response_surface(df, +#' response_col = "y", +#' factor_cols = c("x1", "x2") +#' ) #' res$stationary_point #' res$stationary_nature #' @export @@ -305,7 +348,7 @@ mrm_response_surface <- function(data, response_col, factor_cols) { cols <- list(intercept = rep(1, nrow(X))) for (i in seq_len(k)) cols[[factor_cols[i]]] <- X[, i] - for (i in seq_len(k)) cols[[paste0(factor_cols[i], "^2")]] <- X[, i] ^ 2 + for (i in seq_len(k)) cols[[paste0(factor_cols[i], "^2")]] <- X[, i]^2 for (i in seq_len(k - 1L)) { for (j in seq.int(i + 1L, k)) { cols[[paste0(factor_cols[i], ":", factor_cols[j])]] <- X[, i] * X[, j] @@ -332,13 +375,20 @@ mrm_response_surface <- function(data, response_col, factor_cols) { y_star <- NA_real_ } eigvals <- eigen(B, symmetric = TRUE, only.values = TRUE)$values - nature <- if (all(eigvals < 0)) "maximum" - else if (all(eigvals > 0)) "minimum" - else "saddle" + nature <- if (all(eigvals < 0)) { + "maximum" + } else if (all(eigvals > 0)) { + "minimum" + } else { + "saddle" + } list( coefficients = as.list(round(beta, 6)), - stationary_point = if (is.null(x_star)) NULL else - setNames(as.list(round(as.numeric(x_star), 4)), factor_cols), + stationary_point = if (is.null(x_star)) { + NULL + } else { + setNames(as.list(round(as.numeric(x_star), 4)), factor_cols) + }, stationary_y = if (is.null(x_star)) NULL else round(y_star, 4), stationary_nature = nature, eigenvalues = round(eigvals, 4), @@ -362,22 +412,27 @@ mrm_response_surface <- function(data, response_col, factor_cols) { #' @examples #' # Power to detect a medium effect (Cohen's f = 0.25) with 4 groups #' # of 30 each at alpha = 0.05: -#' res <- mrm_anova_power(k_groups = 4, n_per_group = 30, -#' effect_size_f = 0.25, alpha = 0.05) +#' res <- mrm_anova_power( +#' k_groups = 4, n_per_group = 30, +#' effect_size_f = 0.25, alpha = 0.05 +#' ) #' res$power #' res$F_critical #' #' # Sample-size sensitivity: what power do I get with smaller groups? -#' sapply(c(10, 20, 30, 50, 100), function(n) -#' mrm_anova_power(k_groups = 3, n_per_group = n, -#' effect_size_f = 0.25)$power) +#' sapply(c(10, 20, 30, 50, 100), function(n) { +#' mrm_anova_power( +#' k_groups = 3, n_per_group = n, +#' effect_size_f = 0.25 +#' )$power +#' }) #' @export mrm_anova_power <- function(k_groups, n_per_group, effect_size_f, - alpha = 0.05) { + alpha = 0.05) { df1 <- k_groups - 1L N <- k_groups * n_per_group df2 <- N - k_groups - ncp <- N * effect_size_f ^ 2 + ncp <- N * effect_size_f^2 F_crit <- stats::qf(1 - alpha, df1, df2) power <- 1 - stats::pf(F_crit, df1, df2, ncp = ncp) list( @@ -415,13 +470,16 @@ mrm_anova_power <- function(k_groups, n_per_group, effect_size_f, #' } #' res <- mrm_mc_power(my_sim, n_sims = 500L, alpha = 0.05) #' res$empirical_power -#' res$ci95_lower; res$ci95_upper +#' res$ci95_lower +#' res$ci95_upper #' @export mrm_mc_power <- function(simulator, n_sims = 1000L, alpha = 0.05, seed = 42L) { set.seed(seed) - p_values <- vapply(seq_len(n_sims), - function(i) simulator(sample.int(.Machine$integer.max, 1)), - numeric(1)) + p_values <- vapply( + seq_len(n_sims), + function(i) simulator(sample.int(.Machine$integer.max, 1)), + numeric(1) + ) pwr <- mean(p_values < alpha) se <- sqrt(pwr * (1 - pwr) / n_sims) list( @@ -451,20 +509,24 @@ mrm_mc_power <- function(simulator, n_sims = 1000L, alpha = 0.05, seed = 42L) { #' interpretation. #' @examples #' set.seed(2026) -#' df <- expand.grid(block = paste0("B", 1:6), -#' treatment = c("ctrl", "drug")) +#' df <- expand.grid( +#' block = paste0("B", 1:6), +#' treatment = c("ctrl", "drug") +#' ) #' # Block-level baseline + treatment effect #' df$y <- as.numeric(df$block) * 1.2 + #' ifelse(df$treatment == "drug", 0.7, 0) + #' rnorm(nrow(df), 0, 0.4) -#' res <- mrm_perm_block(df, response_col = "y", -#' treatment_col = "treatment", -#' block_col = "block", -#' n_perm = 500L) +#' res <- mrm_perm_block(df, +#' response_col = "y", +#' treatment_col = "treatment", +#' block_col = "block", +#' n_perm = 500L +#' ) #' res$p_value #' @export mrm_perm_block <- function(data, response_col, treatment_col, block_col, - n_perm = 1000L, seed = 42L) { + n_perm = 1000L, seed = 42L) { d <- data[, c(response_col, treatment_col, block_col)] d <- d[stats::complete.cases(d), , drop = FALSE] trt <- d[[treatment_col]] @@ -515,14 +577,18 @@ mrm_perm_block <- function(data, response_col, treatment_col, block_col, #' mrm_random_latin(k = 4, seed = 42L) #' #' # Reproducible across runs with the same seed: -#' identical(mrm_random_latin(5, seed = 7), -#' mrm_random_latin(5, seed = 7)) +#' identical( +#' mrm_random_latin(5, seed = 7), +#' mrm_random_latin(5, seed = 7) +#' ) #' @export mrm_random_latin <- function(k, seed = 42L) { set.seed(seed) base <- matrix(0L, k, k) - for (i in seq_len(k)) for (j in seq_len(k)) { - base[i, j] <- ((i + j - 2L) %% k) + for (i in seq_len(k)) { + for (j in seq_len(k)) { + base[i, j] <- ((i + j - 2L) %% k) + } } row_perm <- sample.int(k) col_perm <- sample.int(k) diff --git a/r-package/morie/R/mrm_kulldorff.R b/r-package/morie/R/mrm_kulldorff.R index a53e37c13f..76eec3fda8 100644 --- a/r-package/morie/R/mrm_kulldorff.R +++ b/r-package/morie/R/mrm_kulldorff.R @@ -15,12 +15,22 @@ #' Kulldorff, M. (1997). A spatial scan statistic. Communications in #' Statistics: Theory and Methods, 26(6), 1481--1496. #' +#' @return \code{mrm_tps_kulldorff_scan()} returns a named \code{list} with +#' the most likely cluster, its Poisson log-likelihood-ratio statistic, +#' the Monte-Carlo permutation p-value, and a plain-language +#' \code{interpretation}. +#' @examples +#' if (FALSE) { +#' tps <- morie_sample("tps_assault") +#' mrm_tps_kulldorff_scan(tps, n_permutations = 49) +#' } #' @name mrm_kulldorff NULL .haversine_km_mat <- function(lat1, lon1, lat2, lon2) { - R <- 6371; rad <- pi / 180 + R <- 6371 + rad <- pi / 180 dlat <- (lat2 - lat1) * rad dlon <- (lon2 - lon1) * rad a <- sin(dlat / 2)^2 + cos(lat1 * rad) * cos(lat2 * rad) * sin(dlon / 2)^2 @@ -29,10 +39,14 @@ NULL .poisson_lrt <- function(n_obs, n_in, n_exp, n_tot) { - if (n_in == 0 || n_obs == 0 || n_obs <= n_exp) return(0.0) + if (n_in == 0 || n_obs == 0 || n_obs <= n_exp) { + return(0.0) + } obs_out <- n_tot - n_obs exp_out <- n_tot - n_exp - if (obs_out == 0 || exp_out <= 0) return(0.0) + if (obs_out == 0 || exp_out <= 0) { + return(0.0) + } n_obs * log(n_obs / n_exp) + obs_out * log(obs_out / exp_out) } @@ -61,8 +75,8 @@ NULL mrm_tps_kulldorff_scan <- function( data, date_col = "OCC_DATE", - lat_col = "LAT_WGS84", - lon_col = "LONG_WGS84", + lat_col = "LAT_WGS84", + lon_col = "LONG_WGS84", radii_km = c(1, 2, 3, 5, 8), window_years = 4, n_centers = 60L, @@ -74,18 +88,22 @@ mrm_tps_kulldorff_scan <- function( set.seed(as.integer(seed)) df <- data[, c(date_col, lat_col, lon_col)] - d <- suppressWarnings(as.POSIXct(df[[date_col]], - format = "%m/%d/%Y %I:%M:%S %p", - tz = "UTC")) + d <- suppressWarnings(as.POSIXct(df[[date_col]], + format = "%m/%d/%Y %I:%M:%S %p", + tz = "UTC" + )) if (all(is.na(d))) d <- suppressWarnings(as.POSIXct(df[[date_col]], tz = "UTC")) df[[date_col]] <- as.Date(d) df <- df[stats::complete.cases(df), ] df <- df[order(df[[date_col]]), ] n <- nrow(df) - if (n < 100L) return(data.frame()) + if (n < 100L) { + return(data.frame()) + } - lat <- df[[lat_col]]; lon <- df[[lon_col]] - t <- as.integer(df[[date_col]]) + lat <- df[[lat_col]] + lon <- df[[lon_col]] + t <- as.integer(df[[date_col]]) center_idx <- sample.int(n, min(n_centers, n)) window_days <- round(window_years * 365.25) starts <- seq(min(t), max(t) - window_days, length.out = max(2L, (max(t) - min(t)) %/% window_days)) @@ -108,8 +126,10 @@ mrm_tps_kulldorff_scan <- function( n_exp <- n_space * sum(in_time) / n lrt <- .poisson_lrt(n_in_cyl, n_space, n_exp, n) if (lrt > best$lrt) { - best <- list(lrt = lrt, ci = ci, ri = ri, ti = ti, - n_in = n_in_cyl, n_space = n_space) + best <- list( + lrt = lrt, ci = ci, ri = ri, ti = ti, + n_in = n_in_cyl, n_space = n_space + ) } } } @@ -117,29 +137,31 @@ mrm_tps_kulldorff_scan <- function( best } - obs <- scan_one(t) + obs <- scan_one(t) null <- numeric(n_permutations) for (k in seq_len(n_permutations)) null[k] <- scan_one(sample(t))$lrt p_value <- (sum(null >= obs$lrt) + 1) / (n_permutations + 1) - if (obs$ci < 0) return(data.frame()) + if (obs$ci < 0) { + return(data.frame()) + } - r <- radii_km[obs$ri] + r <- radii_km[obs$ri] t_start <- as.Date(obs$ti, origin = "1970-01-01") - t_end <- as.Date(obs$ti + window_days, origin = "1970-01-01") - n_exp <- obs$n_space * window_days / max(1L, max(t) - min(t)) - rr <- obs$n_in / n_exp + t_end <- as.Date(obs$ti + window_days, origin = "1970-01-01") + n_exp <- obs$n_space * window_days / max(1L, max(t) - min(t)) + rr <- obs$n_in / n_exp data.frame( - center_lat = lat[obs$ci], - center_lon = lon[obs$ci], - radius_km = r, - t_start = t_start, - t_end = t_end, - n_observed = obs$n_in, - n_expected = round(n_exp, 2), + center_lat = lat[obs$ci], + center_lon = lon[obs$ci], + radius_km = r, + t_start = t_start, + t_end = t_end, + n_observed = obs$n_in, + n_expected = round(n_exp, 2), relative_risk = round(rr, 3), - log_lrt = round(obs$lrt, 2), - p_value = round(p_value, 4) + log_lrt = round(obs$lrt, 2), + p_value = round(p_value, 4) ) } diff --git a/r-package/morie/R/mrm_lisa.R b/r-package/morie/R/mrm_lisa.R index 6ebf32e7b6..c462fc2573 100644 --- a/r-package/morie/R/mrm_lisa.R +++ b/r-package/morie/R/mrm_lisa.R @@ -13,12 +13,24 @@ #' Anselin, L. (1995). Local indicators of spatial association -- #' LISA. \emph{Geographical Analysis}, 27(2), 93--115. #' +#' @return The LISA callables return named \code{list}s with per-polygon +#' local Moran's I, permutation p-values, cluster classifications, and +#' (for the per-year wrapper) the time series of global Moran's I. +#' @examples +#' if (FALSE) { +#' ncr <- read.csv("Neighbourhood_Crime_Rates_Open_Data.csv") +#' mrm_tps_lisa(ncr, +#' count_col = "ASSAULT_2024", +#' lat_col = "lat", lon_col = "lon" +#' ) +#' } #' @name mrm_lisa NULL .haversine_km_lisa <- function(lat1, lon1, lat2, lon2) { - R <- 6371; rad <- pi / 180 + R <- 6371 + rad <- pi / 180 dlat <- (lat2 - lat1) * rad dlon <- (lon2 - lon1) * rad a <- sin(dlat / 2)^2 + cos(lat1 * rad) * cos(lat2 * rad) * sin(dlon / 2)^2 @@ -30,7 +42,7 @@ NULL n <- length(lat) W <- matrix(0, n, n) for (i in seq_len(n)) { - d <- .haversine_km_lisa(lat[i], lon[i], lat, lon) + d <- .haversine_km_lisa(lat[i], lon[i], lat, lon) nn <- order(d)[2:(k + 1L)] W[i, nn] <- 1.0 / k } @@ -58,14 +70,16 @@ NULL #' @examples #' if (FALSE) { #' ncr <- read.csv("Neighbourhood_Crime_Rates_Open_Data.csv") -#' res <- mrm_tps_lisa(ncr, count_col = "ASSAULT_2024", -#' lat_col = "lat", lon_col = "lon") +#' res <- mrm_tps_lisa(ncr, +#' count_col = "ASSAULT_2024", +#' lat_col = "lat", lon_col = "lon" +#' ) #' } mrm_tps_lisa <- function( data, count_col, lat_col = "lat", lon_col = "lon", - id_col = NULL, + id_col = NULL, k = 6L, n_permutations = 999L, seed = 42L ) { stopifnot(is.data.frame(data), count_col %in% names(data)) @@ -73,22 +87,23 @@ mrm_tps_lisa <- function( set.seed(as.integer(seed)) keep <- stats::complete.cases(data[, c(count_col, lat_col, lon_col)]) - d <- data[keep, , drop = FALSE] - n <- nrow(d) + d <- data[keep, , drop = FALSE] + n <- nrow(d) if (n < 5L) stop("need >= 5 polygons") - lat <- d[[lat_col]]; lon <- d[[lon_col]] - x <- as.numeric(d[[count_col]]) - W <- .knn_weights_lisa(lat, lon, k) - z <- (x - mean(x)) / stats::sd(x) + lat <- d[[lat_col]] + lon <- d[[lon_col]] + x <- as.numeric(d[[count_col]]) + W <- .knn_weights_lisa(lat, lon, k) + z <- (x - mean(x)) / stats::sd(x) lag <- as.vector(W %*% z) - I_local <- z * lag + I_local <- z * lag I_global <- sum(I_local) / sum(z^2) quad <- character(n) - quad[(z > 0) & (lag > 0)] <- "HH" - quad[(z > 0) & (lag <= 0)] <- "HL" - quad[(z <= 0) & (lag > 0)] <- "LH" + quad[(z > 0) & (lag > 0)] <- "HH" + quad[(z > 0) & (lag <= 0)] <- "HL" + quad[(z <= 0) & (lag > 0)] <- "LH" quad[(z <= 0) & (lag <= 0)] <- "LL" p_local <- numeric(n) @@ -111,20 +126,24 @@ mrm_tps_lisa <- function( stringsAsFactors = FALSE ) - qa <- c(HH = sum(quad == "HH"), HL = sum(quad == "HL"), - LH = sum(quad == "LH"), LL = sum(quad == "LL")) - qs <- c(HH = sum((quad == "HH") & (p_local <= 0.05)), - HL = sum((quad == "HL") & (p_local <= 0.05)), - LH = sum((quad == "LH") & (p_local <= 0.05)), - LL = sum((quad == "LL") & (p_local <= 0.05))) + qa <- c( + HH = sum(quad == "HH"), HL = sum(quad == "HL"), + LH = sum(quad == "LH"), LL = sum(quad == "LL") + ) + qs <- c( + HH = sum((quad == "HH") & (p_local <= 0.05)), + HL = sum((quad == "HL") & (p_local <= 0.05)), + LH = sum((quad == "LH") & (p_local <= 0.05)), + LL = sum((quad == "LL") & (p_local <= 0.05)) + ) list( - n_polygons = n, - global_moran_I = round(I_global, 4), - permutations = as.integer(n_permutations), - knn_k = as.integer(k), - table = tbl, - quadrants_all = as.list(qa), + n_polygons = n, + global_moran_I = round(I_global, 4), + permutations = as.integer(n_permutations), + knn_k = as.integer(k), + table = tbl, + quadrants_all = as.list(qa), quadrants_significant_p05 = as.list(qs), n_significant_p05 = sum(qs) ) @@ -146,8 +165,10 @@ mrm_tps_lisa <- function( #' @examples #' # 4 x 4 polygon grid with two yearly count columns. #' set.seed(2026) -#' grid <- expand.grid(lat = 43.6 + (0:3) * 0.02, -#' lon = -79.4 + (0:3) * 0.02) +#' grid <- expand.grid( +#' lat = 43.6 + (0:3) * 0.02, +#' lon = -79.4 + (0:3) * 0.02 +#' ) #' grid$ASSAULT_2023 <- rpois(nrow(grid), lambda = grid$lat * 10) #' grid$ASSAULT_2024 <- rpois(nrow(grid), lambda = grid$lat * 12) #' res <- mrm_tps_polygon_moran_per_year( @@ -168,9 +189,11 @@ mrm_tps_polygon_moran_per_year <- function( yr <- regmatches(c, regexpr("\\d{4}", c)) yr <- if (length(yr) > 0) as.integer(yr) else c res <- tryCatch( - mrm_tps_lisa(data, count_col = c, lat_col = lat_col, - lon_col = lon_col, k = k, - n_permutations = n_permutations, seed = seed), + mrm_tps_lisa(data, + count_col = c, lat_col = lat_col, + lon_col = lon_col, k = k, + n_permutations = n_permutations, seed = seed + ), error = function(e) NULL ) if (is.null(res)) next diff --git a/r-package/morie/R/mrm_mandela_spectrum.R b/r-package/morie/R/mrm_mandela_spectrum.R index 48d14c56b5..a3ffe45a70 100644 --- a/r-package/morie/R/mrm_mandela_spectrum.R +++ b/r-package/morie/R/mrm_mandela_spectrum.R @@ -49,8 +49,8 @@ #' @references #' United Nations General Assembly (2015). United Nations Standard #' Minimum Rules for the Treatment of Prisoners (the Nelson Mandela -#' Rules). A/RES/70/175. Rule 43 = prolonged (>15 days). Rule 44 = -#' >=22 hours/day, no meaningful human contact. +#' Rules). A/RES/70/175. Rule 43 = prolonged (more than 15 days). +#' Rule 44 = at least 22 hours/day, no meaningful human contact. #' #' @export #' @examples @@ -62,13 +62,13 @@ mrm_otis_mandela_spectrum <- function( data, duration_col = "NumberConsecutiveDays_Segregation", - year_col = "EndFiscalYear", - id_col = "UniqueIndividual_ID", + year_col = "EndFiscalYear", + id_col = "UniqueIndividual_ID", threshold_days = 15L, - alert_cols = c("MentalHealth_Alert", "SuicideRisk_Alert", "SuicideWatch_Alert"), + alert_cols = c("MentalHealth_Alert", "SuicideRisk_Alert", "SuicideWatch_Alert"), contact_proxies = c("none", "any_alert", "no_alert"), - denominators = c("row", "individual_any", "individual_cumulative"), - c11_data = NULL + denominators = c("row", "individual_any", "individual_cumulative"), + c11_data = NULL ) { stopifnot(is.data.frame(data)) stopifnot(all(c(duration_col, year_col, id_col) %in% names(data))) @@ -81,11 +81,11 @@ mrm_otis_mandela_spectrum <- function( if (!c %in% names(d)) next yes <- tolower(trimws(as.character(d[[c]]))) == "yes" d[["_any_alert"]] <- d[["_any_alert"]] | yes - d[["_no_alert"]] <- d[["_no_alert"]] & !yes + d[["_no_alert"]] <- d[["_no_alert"]] & !yes } years <- sort(unique(d[[year_col]])) - rows <- list() + rows <- list() for (y in c(as.list(years), list("pooled"))) { if (identical(y, "pooled")) { @@ -108,7 +108,7 @@ mrm_otis_mandela_spectrum <- function( n_d <- sum(ymask) n_m <- sum(elig) } else if (denom == "individual_any") { - ids <- unique(stats::na.omit(d[[id_col]][ymask])) + ids <- unique(stats::na.omit(d[[id_col]][ymask])) ids_m <- unique(stats::na.omit(d[[id_col]][elig])) n_d <- length(ids) n_m <- length(ids_m) @@ -133,7 +133,9 @@ mrm_otis_mandela_spectrum <- function( grepl("Greater than", b) | any(as.integer(regmatches(b, gregexpr("[0-9]+", b))[[1]]) > threshold_days) }, logical(1)) n_m <- sum(sub[["NumberIndividuals_Segregation"]][ab]) - } else stop(sprintf("unknown denominator %s", denom)) + } else { + stop(sprintf("unknown denominator %s", denom)) + } rate <- if (n_d > 0) n_m / n_d else NA_real_ rows[[length(rows) + 1L]] <- data.frame( diff --git a/r-package/morie/R/mrm_mathstats.R b/r-package/morie/R/mrm_mathstats.R index 279789a5a9..8eeb6ac572 100644 --- a/r-package/morie/R/mrm_mathstats.R +++ b/r-package/morie/R/mrm_mathstats.R @@ -11,6 +11,10 @@ #' Casella, G. & Berger, R. L. (2002). Statistical Inference. Duxbury. #' Lehmann, E. L. & Romano, J. P. (2005). Testing Statistical Hypotheses. #' +#' @return Each callable returns a named \code{list} with the computed +#' statistic(s) and a plain-language \code{interpretation}. +#' @examples +#' mrm_oneprop_test(x = 58, n = 100, p0 = 0.5) #' @name mrm_mathstats NULL @@ -74,7 +78,8 @@ mrm_twoprop_test <- function(x1, n1, x2, n2, alpha = 0.05) { if (n1 <= 0 || n2 <= 0 || x1 < 0 || x2 < 0) { stop("invalid sample sizes / counts") } - p1 <- x1 / n1; p2 <- x2 / n2 + p1 <- x1 / n1 + p2 <- x2 / n2 tbl <- matrix(c(x1, n1 - x1, x2, n2 - x2), nrow = 2, byrow = TRUE) ch <- suppressWarnings(stats::chisq.test(tbl, correct = FALSE)) fi <- stats::fisher.test(tbl, alternative = "two.sided") @@ -87,7 +92,7 @@ mrm_twoprop_test <- function(x1, n1, x2, n2, alpha = 0.05) { p1 = round(p1, 6), p2 = round(p2, 6), diff = round(diff, 6), chi2 = round(as.numeric(ch$statistic), 4), - df = as.integer(ch$parameter), + df = as.integer(ch$parameter), p_value_chi2 = as.numeric(ch$p.value), p_value_fisher = as.numeric(fi$p.value), z_wald = round(z_w, 4), @@ -118,7 +123,8 @@ mrm_twoprop_test <- function(x1, n1, x2, n2, alpha = 0.05) { #' mrm_var_test(sample = x, sigma0_sq = 1) #' @export mrm_var_test <- function(sample, sigma0_sq, alpha = 0.05) { - x <- as.numeric(sample); x <- x[is.finite(x)] + x <- as.numeric(sample) + x <- x[is.finite(x)] n <- length(x) if (n < 2L) stop("need >= 2 observations") s_sq <- stats::var(x) @@ -162,7 +168,8 @@ mrm_var_test <- function(sample, sigma0_sq, alpha = 0.05) { #' # plot(qq$theoretical, qq$empirical); abline(0, 1) #' @export mrm_qq_plot <- function(sample, dist = "norm", ...) { - x <- sort(as.numeric(sample)); x <- x[is.finite(x)] + x <- sort(as.numeric(sample)) + x <- x[is.finite(x)] n <- length(x) if (n < 2L) stop("need >= 2 observations") p <- ((seq_len(n)) - 0.375) / (n + 0.25) @@ -191,22 +198,26 @@ mrm_qq_plot <- function(sample, dist = "norm", ...) { #' @examples #' # 1000 sample means of size 30 from an exponential(1) base; #' # standardised z-scores converge to N(0,1): -#' res <- mrm_clt_demo(base_distribution = "exp", -#' n_samples = 1000L, -#' sample_size = 30L, -#' seed = 42L, rate = 1) +#' res <- mrm_clt_demo( +#' base_distribution = "exp", +#' n_samples = 1000L, +#' sample_size = 30L, +#' seed = 42L, rate = 1 +#' ) #' summary(res$z_score) #' # mean ~ 0, sd ~ 1 #' @export mrm_clt_demo <- function(base_distribution = "unif", - n_samples = 1000L, - sample_size = 30L, - seed = 42L, ...) { + n_samples = 1000L, + sample_size = 30L, + seed = 42L, ...) { set.seed(seed) rfun <- get(paste0("r", base_distribution), envir = asNamespace("stats")) - means <- vapply(seq_len(n_samples), - function(i) mean(rfun(sample_size, ...)), - numeric(1)) + means <- vapply( + seq_len(n_samples), + function(i) mean(rfun(sample_size, ...)), + numeric(1) + ) data.frame( sample_index = seq_len(n_samples), sample_mean = means, @@ -231,13 +242,14 @@ mrm_clt_demo <- function(base_distribution = "unif", #' x <- rnorm(200) #' # Under correct distributional assumption, U should be ~Uniform(0,1): #' pit <- mrm_pit(x, dist = "norm") -#' attr(pit, "ks_pvalue") # large p-value => no evidence against fit +#' attr(pit, "ks_pvalue") # large p-value => no evidence against fit #' # If we deliberately misspecify (claim t_3 fits the normal sample): #' pit_wrong <- mrm_pit(x, dist = "t", df = 3) -#' attr(pit_wrong, "ks_pvalue") # small p-value => misspecification detected +#' attr(pit_wrong, "ks_pvalue") # small p-value => misspecification detected #' @export mrm_pit <- function(sample, dist = "norm", ...) { - x <- as.numeric(sample); x <- x[is.finite(x)] + x <- as.numeric(sample) + x <- x[is.finite(x)] pfun <- get(paste0("p", dist), envir = asNamespace("stats")) U <- pfun(x, ...) ks <- suppressWarnings(stats::ks.test(U, "punif")) diff --git a/r-package/morie/R/mrm_otis.R b/r-package/morie/R/mrm_otis.R index 32d60ce7e2..d0d84d0d5a 100644 --- a/r-package/morie/R/mrm_otis.R +++ b/r-package/morie/R/mrm_otis.R @@ -25,6 +25,14 @@ #' is randomly reassigned every fiscal year. Cross-year tracking is #' therefore invalid; all analyses below operate within fiscal year. #' +#' @return Each \code{mrm_otis_*()} callable returns a named \code{list} with +#' the computed statistics (concentration indices, survival curves, or +#' association measures) and a plain-language \code{interpretation}. +#' @examples +#' if (FALSE) { +#' b09 <- read.csv("b09_individuals_in_segregation.csv") +#' mrm_otis_placement_concentration(b09) +#' } #' @name mrm_otis NULL @@ -36,19 +44,25 @@ NULL .gini_int <- function(x) { x <- sort(as.numeric(x)) n <- length(x) - if (n == 0L || sum(x) == 0) return(NA_real_) + if (n == 0L || sum(x) == 0) { + return(NA_real_) + } (2 * sum(seq_len(n) * x) - (n + 1) * sum(x)) / (n * sum(x)) } .hill_mle <- function(x, x_min) { x <- x[x >= x_min] n <- length(x) - if (n < 2L) return(NA_real_) + if (n < 2L) { + return(NA_real_) + } 1 + n / sum(log(x / x_min)) } .cramer_v <- function(tbl) { - if (any(dim(tbl) < 2L)) return(NA_real_) + if (any(dim(tbl) < 2L)) { + return(NA_real_) + } chi <- suppressWarnings(stats::chisq.test(tbl, correct = FALSE)) k <- min(dim(tbl)) sqrt(as.numeric(chi$statistic) / (sum(tbl) * (k - 1))) @@ -122,7 +136,9 @@ mrm_otis_placement_concentration <- function( return(mean(nums)) } nums <- as.numeric(regmatches(s, regexpr("[0-9]+", s))) - if (length(nums) == 0L) return(NA_real_) + if (length(nums) == 0L) { + return(NA_real_) + } nums[1] } @@ -149,10 +165,12 @@ mrm_otis_placement_concentration <- function( x <- x[is.finite(x) & x > 0] n <- length(x) if (n == 0L) { - return(data.frame(year = label, n_individuals = 0, - n_placements = 0, mean_per_individual = NA_real_, - gini = NA_real_, hill_alpha = NA_real_, - top_pct_share = NA_real_)) + return(data.frame( + year = label, n_individuals = 0, + n_placements = 0, mean_per_individual = NA_real_, + gini = NA_real_, hill_alpha = NA_real_, + top_pct_share = NA_real_ + )) } x_sorted <- sort(x, decreasing = TRUE) cut <- max(1L, ceiling(top_pct * n)) @@ -233,10 +251,12 @@ mrm_otis_seg_duration_km <- function( d <- d[!is.na(d) & d > 0] n <- length(d) if (n == 0L) { - return(data.frame(stratum = s, n = 0, mean_days = NA_real_, - median_days = NA_real_, q25_days = NA_real_, - pct_above_mandela = NA_real_, - median_among_above_mandela = NA_real_)) + return(data.frame( + stratum = s, n = 0, mean_days = NA_real_, + median_days = NA_real_, q25_days = NA_real_, + pct_above_mandela = NA_real_, + median_among_above_mandela = NA_real_ + )) } above <- d > mandela_threshold data.frame( @@ -271,7 +291,7 @@ mrm_otis_seg_duration_km <- function( #' @param alert_cols Character vector of alert column names #' (default the three b01 alert columns). #' @return A data.frame with one row per pair, columns `alert_a`, -#' `alert_b`, `n`, `chi2`, `df`, `p_value`, `cramers_v`. +#' `alert_b`, `n`, `chi2`, `df`, `p_value`, `morie_cramers_v`. #' @export #' @examples #' if (FALSE) { @@ -288,7 +308,8 @@ mrm_otis_mortification_cooccurrence <- function( names(bins) <- alert_cols pairs <- utils::combn(alert_cols, 2L, simplify = FALSE) rows <- lapply(pairs, function(p) { - a <- bins[[p[1]]]; b <- bins[[p[2]]] + a <- bins[[p[1]]] + b <- bins[[p[2]]] keep <- !is.na(a) & !is.na(b) tbl <- table(a[keep], b[keep]) chi <- suppressWarnings(stats::chisq.test(tbl, correct = FALSE)) @@ -298,7 +319,7 @@ mrm_otis_mortification_cooccurrence <- function( chi2 = round(as.numeric(chi$statistic), 2), df = as.integer(chi$parameter), p_value = signif(chi$p.value, 3), - cramers_v = round(.cramer_v(tbl), 4) + morie_cramers_v = round(.cramer_v(tbl), 4) ) }) do.call(rbind, rows) @@ -324,7 +345,7 @@ mrm_otis_mortification_cooccurrence <- function( #' @param region_recent_col Column name of the most-recent region #' (default `"Region_MostRecentPlacement"`). #' @return A list with named elements `table` (the contingency matrix), -#' `chi2`, `df`, `p_value`, `cramers_v`, `diagonal_share`, +#' `chi2`, `df`, `p_value`, `morie_cramers_v`, `diagonal_share`, #' `off_diagonal_share`. #' @export #' @examples @@ -348,7 +369,7 @@ mrm_otis_region_locality <- function( chi2 = round(as.numeric(chi$statistic), 2), df = as.integer(chi$parameter), p_value = signif(chi$p.value, 3), - cramers_v = round(.cramer_v(tbl), 4), + morie_cramers_v = round(.cramer_v(tbl), 4), diagonal_share = round(diag_sum / total, 4), off_diagonal_share = round(1 - diag_sum / total, 4) ) diff --git a/r-package/morie/R/mrm_samples.R b/r-package/morie/R/mrm_samples.R index fc49d47678..72a040e68a 100644 --- a/r-package/morie/R/mrm_samples.R +++ b/r-package/morie/R/mrm_samples.R @@ -13,10 +13,19 @@ #' * TPS: Toronto Police Open Data ArcGIS REST. Use #' `morie_fetch_tps(category = "Assault")`. #' * SIU: Ontario SIU Director's Reports site. Use -#' `morie_fetch_siu()` which scrapes the public reports site on -#' demand (per-user, since redistribution of the scraped corpus is +#' `morie_fetch_siu()` which parses the public reports site on +#' demand (per-user, since redistribution of the parsed corpus is #' not clearly licensed). #' +#' @return The on-demand fetchers (\code{morie_fetch_tps()}, +#' \code{morie_fetch_siu()}) return the file path to the downloaded or +#' cached CSV; \code{morie_load_dataset()} returns the loaded +#' \code{data.frame}. +#' @examples +#' if (FALSE) { +#' b01 <- morie_load_dataset("otisb01") +#' head(b01) +#' } #' @name mrm_samples NULL @@ -60,8 +69,8 @@ morie_sample <- function(name = c("otis_b01", "otis_b09", "otis_c11", "tps_assau #' FeatureServer layer roots. #' @examples #' urls <- morie_tps_layer_urls() -#' names(urls) # categories: Assault, AutoTheft, Homicide, ... -#' length(urls) # number of layers +#' names(urls) # categories: Assault, AutoTheft, Homicide, ... +#' length(urls) # number of layers #' @export morie_tps_layer_urls <- function() { c( @@ -74,13 +83,16 @@ morie_tps_layer_urls <- function() { BreakAndEnter = "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Break_and_Enter_Open_Data/FeatureServer/0", Homicides = - "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Homicides_Open_Data_ASR_RC_TBL_002/FeatureServer/0", + paste0("https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/", + "rest/services/Homicides_Open_Data_ASR_RC_TBL_002/FeatureServer/0"), Robbery = "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Robbery_Open_Data/FeatureServer/0", ShootingAndFirearmDiscarges = - "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Shooting_and_Firearm_Discharges_Open_Data/FeatureServer/0", + paste0("https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/", + "rest/services/Shooting_and_Firearm_Discharges_Open_Data/FeatureServer/0"), TheftFromMV = - "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Theft_From_Motor_Vehicle_Open_Data/FeatureServer/0", + paste0("https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/", + "rest/services/Theft_From_Motor_Vehicle_Open_Data/FeatureServer/0"), TheftOver = "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Theft_Over_Open_Data/FeatureServer/0" ) @@ -94,8 +106,11 @@ morie_tps_layer_urls <- function() { #' calls unless `overwrite = TRUE`. #' #' @param category One of `names(morie_tps_layer_urls())`. -#' @param cache_dir Directory for the CSV -#' (default `"~/.cache/morie/tps"`). +#' @param cache_dir Directory for the CSV. Defaults to a +#' session-scoped subdirectory of `tempdir()` that R cleans up +#' automatically. For persistent caching pass +#' `cache_dir = morie_cache_dir("tps")`; see +#' [morie_cache_dir] and [morie_cache_clear]. #' @param where ArcGIS SQL where clause (default `"1=1"`). #' @param overwrite Logical; if `FALSE` and the CSV exists, return its #' path without re-downloading. @@ -103,26 +118,30 @@ morie_tps_layer_urls <- function() { #' @return Path to the CSV. #' @examples #' \dontrun{ -#' # Network: fetches major-crime indicators from the Toronto Police -#' # ArcGIS open-data layer. -#' csv <- morie_fetch_tps(category = "Assault", -#' cache_dir = tempdir(), -#' where = "OCC_YEAR = 2024") -#' tps <- utils::read.csv(csv) -#' nrow(tps) +#' # Network: fetches major-crime indicators from the Toronto Police +#' # ArcGIS open-data layer. +#' csv <- morie_fetch_tps( +#' category = "Assault", +#' cache_dir = tempdir(), +#' where = "OCC_YEAR = 2024" +#' ) +#' tps <- utils::read.csv(csv) +#' nrow(tps) #' } #' @export morie_fetch_tps <- function( category, - cache_dir = "~/.cache/morie/tps", + cache_dir = file.path(tempdir(), "morie", "tps"), where = "1=1", overwrite = FALSE, max_per_page = 2000L ) { urls <- morie_tps_layer_urls() if (!category %in% names(urls)) { - stop("Unknown TPS category. Known: ", - paste(names(urls), collapse = ", ")) + stop( + "Unknown TPS category. Known: ", + paste(names(urls), collapse = ", ") + ) } if (!requireNamespace("jsonlite", quietly = TRUE)) { stop("jsonlite required for morie_fetch_tps().") @@ -130,31 +149,46 @@ morie_fetch_tps <- function( cache_dir <- path.expand(cache_dir) dir.create(cache_dir, showWarnings = FALSE, recursive = TRUE) out <- file.path(cache_dir, paste0("tps_", category, ".csv")) - if (file.exists(out) && !overwrite) return(out) + if (file.exists(out) && !overwrite) { + return(out) + } base <- urls[[category]] offset <- 0L rows <- list() - while (TRUE) { + repeat { url <- sprintf( "%s/query?where=%s&outFields=*&returnGeometry=true&f=geojson&resultRecordCount=%d&resultOffset=%d", base, utils::URLencode(where, reserved = TRUE), max_per_page, offset ) page <- tryCatch(jsonlite::fromJSON(url, simplifyVector = FALSE), - error = function(e) NULL) - if (is.null(page)) break + error = function(e) NULL + ) + if (is.null(page)) { + # Abort loudly rather than silently caching a partial download: + # a transient failure mid-paging must not be written to disk and + # then returned as if it were the complete layer. + stop( + "morie_fetch_tps(): TPS ArcGIS request failed at offset ", + offset, " for category '", category, "'." + ) + } feats <- page$features if (length(feats) == 0L) break for (f in feats) { r <- f$properties if (!is.null(f$geometry) && identical(f$geometry$type, "Point")) { r$LONG_WGS84 <- f$geometry$coordinates[[1]] - r$LAT_WGS84 <- f$geometry$coordinates[[2]] + r$LAT_WGS84 <- f$geometry$coordinates[[2]] } rows[[length(rows) + 1L]] <- r } - if (length(feats) < max_per_page) break offset <- offset + length(feats) + # Page on the server's exceededTransferLimit flag, NOT on a short + # page: a layer whose server-side maxRecordCount is below + # max_per_page returns short pages on every call, so breaking on a + # short page would silently truncate the download to page one. + if (!isTRUE(page$exceededTransferLimit)) break } if (length(rows) == 0L) stop("No features returned for ", category) df <- do.call(rbind, lapply(rows, function(r) as.data.frame(r, stringsAsFactors = FALSE))) @@ -163,49 +197,5 @@ morie_fetch_tps <- function( } -# --------------------------------------------------------------------------- -# SIU on-demand scraper (placeholder wrapper around the Python implementation) -# --------------------------------------------------------------------------- - -#' Fetch Ontario SIU Director's Reports into a local CSV -#' -#' R wrapper around the Python `morie.siu_fetch.fetch_siu_cases()` -#' on-demand scraper. The R version delegates via `reticulate` so the -#' regex / HTML parsing lives in a single canonical location. -#' -#' The scraped corpus is NOT shipped with the package; each user runs -#' the scraper themselves, which is unambiguously fair use of public -#' oversight reports. -#' -#' @param years Optional integer vector of years to scrape. `NULL` -#' (default) scrapes the full unfiltered index. -#' @param cache_dir Output directory (default `"~/.cache/morie/siu"`). -#' @param overwrite Logical; if `FALSE` and `SIU.csv` exists, returns -#' its path without rescraping. -#' @return Path to the populated SIU.csv. -#' @examples -#' \dontrun{ -#' # Network: scrapes the Ontario SIU Director's Reports site. -#' csv <- morie_fetch_siu(years = 2023:2024, -#' cache_dir = tempdir()) -#' siu <- utils::read.csv(csv) -#' table(siu$year) -#' } -#' @export -morie_fetch_siu <- function( - years = NULL, - cache_dir = "~/.cache/morie/siu", - overwrite = FALSE -) { - if (!requireNamespace("reticulate", quietly = TRUE)) { - stop("reticulate required for morie_fetch_siu().") - } - py <- reticulate::import("morie.siu_fetch", convert = FALSE) - out <- py$fetch_siu_cases( - years = if (is.null(years)) NULL else as.integer(years), - cache_dir = path.expand(cache_dir), - overwrite = overwrite, - progress = TRUE - ) - as.character(out) -} +# morie_fetch_siu() now lives in R/siu.R -- it drives the all-C/C++ +# SIU parser (src/siu_parser.cpp) instead of the Python module. diff --git a/r-package/morie/R/mrm_siu.R b/r-package/morie/R/mrm_siu.R index 82ed982ffa..acfcfd9b2b 100644 --- a/r-package/morie/R/mrm_siu.R +++ b/r-package/morie/R/mrm_siu.R @@ -17,11 +17,19 @@ #' categories (`charges_laid`, `no_charges`, etc.) by service and #' by year, reporting both raw counts and shares. #' +#' @return Each \code{mrm_siu_*()} callable returns a named \code{list} with +#' the survival, per-service rate, or outcome-classification result and a +#' plain-language \code{interpretation}. +#' @examples +#' if (FALSE) { +#' siu <- read.csv("SIU.csv") +#' mrm_siu_case_to_decision_km(siu) +#' } #' @name mrm_siu NULL -.parse_iso <- function(x) suppressWarnings(as.Date(x)) +.parse_iso <- function(x) suppressWarnings(as.Date(x, format = "%Y-%m-%d")) # --------------------------------------------------------------------------- @@ -92,14 +100,18 @@ mrm_siu_case_to_decision_km <- function( observed <- keep_inc & (!is.na(dec) | censor_open_cases) ok <- observed & is.finite(gap) & gap >= 0 - gap <- gap[ok]; svc <- svc[ok]; censored <- censored[ok] + gap <- gap[ok] + svc <- svc[ok] + censored <- censored[ok] summarise <- function(gap_v, cens_v, label) { if (length(gap_v) == 0L) { - return(data.frame(stratum = label, n = 0L, n_censored = 0L, - median_days = NA_real_, mean_days = NA_real_, - p25_days = NA_real_, p75_days = NA_real_, - max_days = NA_real_)) + return(data.frame( + stratum = label, n = 0L, n_censored = 0L, + median_days = NA_real_, mean_days = NA_real_, + p25_days = NA_real_, p75_days = NA_real_, + max_days = NA_real_ + )) } data.frame( stratum = label, @@ -116,10 +128,12 @@ mrm_siu_case_to_decision_km <- function( pooled <- summarise(gap, censored, "pooled") by_svc <- by(seq_along(gap), svc, function(idx) { - if (length(idx) < min_n) return(NULL) + if (length(idx) < min_n) { + return(NULL) + } summarise(gap[idx], censored[idx], unique(svc[idx])) }, simplify = FALSE) - by_svc <- do.call(rbind, by_svc[!sapply(by_svc, is.null)]) + by_svc <- do.call(rbind, by_svc[!vapply(by_svc, is.null, logical(1))]) if (!is.null(by_svc)) rownames(by_svc) <- NULL list(pooled = pooled, by_service = by_svc) @@ -210,9 +224,14 @@ mrm_siu_outcome_classifier <- function( ) { stopifnot(is.data.frame(data)) if (!outcome_col %in% names(data)) { - for (alt in c("director_decision", "outcome", "decision", - "director_decision_outcome", "director_decision_text")) { - if (alt %in% names(data)) { outcome_col <- alt; break } + for (alt in c( + "director_decision", "outcome", "decision", + "director_decision_outcome", "director_decision_text" + )) { + if (alt %in% names(data)) { + outcome_col <- alt + break + } } } stopifnot(outcome_col %in% names(data)) @@ -220,7 +239,8 @@ mrm_siu_outcome_classifier <- function( out <- as.character(data[[outcome_col]]) svc <- as.character(data[[service_col]]) ok <- !is.na(out) & nchar(out) > 0 & !is.na(svc) & nchar(svc) > 0 - out <- out[ok]; svc <- svc[ok] + out <- out[ok] + svc <- svc[ok] tab <- table(svc, out) totals <- rowSums(tab) df <- as.data.frame.table(tab, responseName = "n_cases") diff --git a/r-package/morie/R/mrm_tps.R b/r-package/morie/R/mrm_tps.R index b1f8037c87..37f52bbc54 100644 --- a/r-package/morie/R/mrm_tps.R +++ b/r-package/morie/R/mrm_tps.R @@ -15,6 +15,15 @@ #' precomputed per-category Hawkes (Markovian + Weibull/sin) #' fits from the `paper_hawkes_refit.json` manifest if available. #' +#' @return Each \code{mrm_tps_*()} callable returns a named \code{list} with +#' the computed statistic (Pareto exponent, Moran's I, or survival curve) +#' and a plain-language \code{interpretation}; \code{mrm_tps_load_hawkes_refit()} +#' returns the parsed Hawkes-refit manifest as a \code{list}. +#' @examples +#' if (FALSE) { +#' tps <- read.csv("Assault_Open_Data.csv") +#' mrm_tps_levy_scaling(tps) +#' } #' @name mrm_tps NULL @@ -78,11 +87,16 @@ mrm_tps_levy_scaling <- function( lat <- data[[lat_col]][ord] lon <- data[[lon_col]][ord] keep <- is.finite(lat) & is.finite(lon) - lat <- lat[keep]; lon <- lon[keep] + lat <- lat[keep] + lon <- lon[keep] n <- length(lat) - if (n < 2L) return(list(n_events = n, n_steps_tail = 0L, - min_step_km = min_step_km, hill_alpha = NA_real_)) + if (n < 2L) { + return(list( + n_events = n, n_steps_tail = 0L, + min_step_km = min_step_km, hill_alpha = NA_real_ + )) + } step <- .haversine_km(lat[-n], lon[-n], lat[-1], lon[-1]) tail <- step[step >= min_step_km] alpha <- if (length(tail) >= 2L) 1 + length(tail) / sum(log(tail / x_min)) else NA_real_ @@ -136,14 +150,18 @@ mrm_tps_moran_clustering <- function( ) { stopifnot(is.data.frame(data)) stopifnot(all(c(lat_col, lon_col) %in% names(data))) - lat <- data[[lat_col]]; lon <- data[[lon_col]] + lat <- data[[lat_col]] + lon <- data[[lon_col]] keep <- is.finite(lat) & is.finite(lon) - lat <- lat[keep]; lon <- lon[keep] + lat <- lat[keep] + lon <- lon[keep] n <- length(lat) if (n < 10L) { - return(list(morans_I = NA_real_, morans_z = NA_real_, - dbscan_n_clusters = 0L, dbscan_n_noise = 0L, - dbscan_largest = 0L)) + return(list( + morans_I = NA_real_, morans_z = NA_real_, + dbscan_n_clusters = 0L, dbscan_n_noise = 0L, + dbscan_largest = 0L + )) } # --- Moran via raster counts --- @@ -156,18 +174,21 @@ mrm_tps_moran_clustering <- function( z <- as.vector(counts) - mean(counts) N <- length(z) # rook contiguity neighbour pairs - W_sum <- 0; num <- 0 - for (i in seq_len(grid_resolution)) for (j in seq_len(grid_resolution)) { - if (i < grid_resolution) { - num <- num + z[(j - 1) * grid_resolution + i] * z[(j - 1) * grid_resolution + i + 1L] - W_sum <- W_sum + 1 - } - if (j < grid_resolution) { - num <- num + z[(j - 1) * grid_resolution + i] * z[j * grid_resolution + i] - W_sum <- W_sum + 1 + W_sum <- 0 + num <- 0 + for (i in seq_len(grid_resolution)) { + for (j in seq_len(grid_resolution)) { + if (i < grid_resolution) { + num <- num + z[(j - 1) * grid_resolution + i] * z[(j - 1) * grid_resolution + i + 1L] + W_sum <- W_sum + 1 + } + if (j < grid_resolution) { + num <- num + z[(j - 1) * grid_resolution + i] * z[j * grid_resolution + i] + W_sum <- W_sum + 1 + } } } - num <- 2 * num # rook is symmetric + num <- 2 * num # rook is symmetric W_sum <- 2 * W_sum morans_I <- (N / W_sum) * num / sum(z^2) # Approximate z-score under randomisation H0 (E[I] = -1/(N-1)) @@ -184,7 +205,9 @@ mrm_tps_moran_clustering <- function( n_noise <- sum(cl == 0L) largest <- if (n_clusters > 0L) max(table(cl[cl != 0L])) else 0L } else { - n_clusters <- NA_integer_; n_noise <- NA_integer_; largest <- NA_integer_ + n_clusters <- NA_integer_ + n_noise <- NA_integer_ + largest <- NA_integer_ } list( @@ -240,14 +263,20 @@ mrm_tps_neighbourhood_recurrence_km <- function( d <- as.Date(d) h <- data[[hood_col]] ok <- !is.na(d) & !is.na(h) - d <- d[ok]; h <- h[ok] + d <- d[ok] + h <- h[ok] ord <- order(h, d) - h <- h[ord]; d <- d[ord] + h <- h[ord] + d <- d[ord] rows <- by(d, h, function(dd) { - if (length(dd) < 2L) return(NULL) + if (length(dd) < 2L) { + return(NULL) + } gaps <- as.numeric(diff(dd)) gaps <- gaps[gaps >= min_gap_days] - if (length(gaps) == 0L) return(NULL) + if (length(gaps) == 0L) { + return(NULL) + } data.frame( n_events = length(dd), n_gaps = length(gaps), @@ -257,8 +286,10 @@ mrm_tps_neighbourhood_recurrence_km <- function( p75_gap_days = stats::quantile(gaps, 0.75, names = FALSE) ) }, simplify = FALSE) - out <- do.call(rbind, Map(cbind, hood = names(rows[!sapply(rows, is.null)]), - rows[!sapply(rows, is.null)])) + out <- do.call(rbind, Map(cbind, + hood = names(rows[!vapply(rows, is.null, logical(1))]), + rows[!vapply(rows, is.null, logical(1))] + )) rownames(out) <- NULL out } diff --git a/r-package/morie/R/mtgbl.R b/r-package/morie/R/mtgbl.R index 7f9097c5a6..01a03e9043 100644 --- a/r-package/morie/R/mtgbl.R +++ b/r-package/morie/R/mtgbl.R @@ -10,25 +10,32 @@ #' @return list(estimate, G_hat, B_hat, Sigma_g, Sigma_e, n, t, method). #' @references Montesinos Lopez Ch 10. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_multi_trait_gblup( +#' x = rnorm(50), y = rnorm(50), +#' markers = matrix(sample(0:2, 200, TRUE), 50, 4) +#' ) #' @export -multi_trait_gblup <- function(x, y, markers, Sigma_g = NULL, Sigma_e = NULL) { - Y <- as.matrix(y); n <- nrow(Y); t <- ncol(Y) +morie_multi_trait_gblup <- function(x, y, markers, Sigma_g = NULL, Sigma_e = NULL) { + Y <- as.matrix(y) + n <- nrow(Y) + t <- ncol(Y) M <- as.matrix(markers) - G_mat <- grm_vanraden(M, method = 1)$estimate + 1e-6 * diag(n) + G_mat <- morie_grm_vanraden(M, method = 1)$estimate + 1e-6 * diag(n) cand <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) { matrix(1, n, 1) - } else cbind(1, as.matrix(x)) - qrx <- qr(cand); X <- cand[, qrx$pivot[seq_len(qrx$rank)], drop = FALSE] + } else { + cbind(1, as.matrix(x)) + } + qrx <- qr(cand) + X <- cand[, qrx$pivot[seq_len(qrx$rank)], drop = FALSE] if (is.null(Sigma_g) || is.null(Sigma_e)) { h2 <- 0.5 S_y <- if (t > 1) stats::cov(Y) else matrix(stats::var(as.numeric(Y)), 1, 1) - Sigma_g <- h2 * S_y; Sigma_e <- (1 - h2) * S_y + Sigma_g <- h2 * S_y + Sigma_e <- (1 - h2) * S_y } - Sigma_g <- as.matrix(Sigma_g); Sigma_e <- as.matrix(Sigma_e) + Sigma_g <- as.matrix(Sigma_g) + Sigma_e <- as.matrix(Sigma_e) B <- stats::lsfit(X, Y, intercept = FALSE)$coefficients if (!is.matrix(B)) B <- matrix(B, ncol = t) R <- Y - X %*% B @@ -37,11 +44,13 @@ multi_trait_gblup <- function(x, y, markers, Sigma_g = NULL, Sigma_e = NULL) { SG <- kronecker(Sigma_g, G_mat) g_vec <- SG %*% solve(V, vec_R) G_hat <- matrix(g_vec, n, t) - list(estimate = mean(G_hat), G_hat = G_hat, B_hat = B, - Sigma_g = Sigma_g, Sigma_e = Sigma_e, - n = n, t = t, method = "Multi-trait GBLUP (vec-stacked MME)") + list( + estimate = mean(G_hat), G_hat = G_hat, B_hat = B, + Sigma_g = Sigma_g, Sigma_e = Sigma_e, + n = n, t = t, method = "Multi-trait GBLUP (vec-stacked MME)" + ) } # CANONICAL TEST # set.seed(5); M <- matrix(sample(0:2, 48, TRUE), 6, 8) -# Y <- matrix(rnorm(12), 6, 2); multi_trait_gblup(rep(0,6), Y, M) +# Y <- matrix(rnorm(12), 6, 2); morie_multi_trait_gblup(rep(0,6), Y, M) diff --git a/r-package/morie/R/mxpol.R b/r-package/morie/R/mxpol.R index f964bb1f07..3f7f35284a 100644 --- a/r-package/morie/R/mxpol.R +++ b/r-package/morie/R/mxpol.R @@ -12,18 +12,18 @@ #' @return Named list \code{(y, estimate, argmax, output_shape, method)}. #' @references Goodfellow et al. (2016), Deep Learning, Ch 9.3. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -mxpol_maxpool_forward <- function(x, kernel_size = 2L, stride = NULL) { +morie_mxpol_maxpool_forward <- function(x, kernel_size = 2L, stride = NULL) { x <- as.matrix(x) k <- as.integer(kernel_size) s <- if (is.null(stride)) k else as.integer(stride) - H <- nrow(x); W <- ncol(x) - if (H < k || W < k) + H <- nrow(x) + W <- ncol(x) + if (H < k || W < k) { stop(sprintf("Input (%d,%d) smaller than kernel %d", H, W, k)) + } out_h <- (H - k) %/% s + 1L out_w <- (W - k) %/% s + 1L y <- matrix(0, out_h, out_w) @@ -37,12 +37,14 @@ mxpol_maxpool_forward <- function(x, kernel_size = 2L, stride = NULL) { argmax[i, j] <- which.max(block) - 1L } } - list(y = y, estimate = y, argmax = argmax, - output_shape = c(out_h, out_w), - method = "MaxPool2D forward") + list( + y = y, estimate = y, argmax = argmax, + output_shape = c(out_h, out_w), + method = "MaxPool2D forward" + ) } -#' @rdname mxpol_maxpool_forward +#' @rdname morie_mxpol_maxpool_forward #' @keywords internal #' @export -maxpool_forward <- mxpol_maxpool_forward +morie_maxpool_forward <- morie_mxpol_maxpool_forward diff --git a/r-package/morie/R/nbeat.R b/r-package/morie/R/nbeat.R index d89a45955c..8b0302becb 100644 --- a/r-package/morie/R/nbeat.R +++ b/r-package/morie/R/nbeat.R @@ -10,29 +10,36 @@ #' @return Named list with \code{forecast, fitted, trend, seasonal, #' theta_trend, theta_seasonal, r2, n, horizon, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_nbeats_basis(x = rnorm(50)) #' @export -nbeats_basis <- function(x, horizon = 1, n_trend = 3, n_season = 5, - period = 12) { - y <- as.numeric(x); n <- length(y) - if (n < n_trend + 2 * n_season + 2) +morie_nbeats_basis <- function(x, horizon = 1, n_trend = 3, n_season = 5, + period = 12) { + y <- as.numeric(x) + n <- length(y) + if (n < n_trend + 2 * n_season + 2) { stop("Series too short for chosen basis.") + } t <- seq(0, n - 1) - Tmat <- sapply(0:n_trend, function(k) t^k) - Smat <- do.call(cbind, lapply(seq_len(n_season), function(j) - cbind(sin(2 * pi * j * t / period), - cos(2 * pi * j * t / period)))) + Tmat <- vapply(0:n_trend, function(k) t^k, numeric(length(t))) + Smat <- do.call(cbind, lapply(seq_len(n_season), function(j) { + cbind( + sin(2 * pi * j * t / period), + cos(2 * pi * j * t / period) + ) + })) Xmat <- cbind(Tmat, Smat) coef <- lsfit(Xmat, y, intercept = FALSE)$coef fitted_y <- as.numeric(Xmat %*% coef) tf <- seq(n, n + horizon - 1) - Tf <- sapply(0:n_trend, function(k) tf^k) - Sf <- do.call(cbind, lapply(seq_len(n_season), function(j) - cbind(sin(2 * pi * j * tf / period), - cos(2 * pi * j * tf / period)))) + Tf <- matrix(vapply(0:n_trend, function(k) tf^k, numeric(length(tf))), + nrow = length(tf) + ) + Sf <- do.call(cbind, lapply(seq_len(n_season), function(j) { + cbind( + sin(2 * pi * j * tf / period), + cos(2 * pi * j * tf / period) + ) + })) Xf <- cbind(Tf, Sf) forecast <- as.numeric(Xf %*% coef) theta_trend <- coef[seq_len(n_trend + 1)] @@ -41,10 +48,14 @@ nbeats_basis <- function(x, horizon = 1, n_trend = 3, n_season = 5, seasonal <- as.numeric(Smat %*% theta_season) ss_tot <- sum((y - mean(y))^2) r2 <- if (ss_tot > 0) 1 - sum((y - fitted_y)^2) / ss_tot else NA_real_ - list(forecast = forecast, fitted = fitted_y, - trend = trend, seasonal = seasonal, - theta_trend = theta_trend, theta_seasonal = theta_season, - r2 = r2, n = n, horizon = horizon, - method = sprintf("N-BEATS basis: poly(P=%d) + Fourier(H=%d, period=%d)", - n_trend, n_season, period)) + list( + forecast = forecast, fitted = fitted_y, + trend = trend, seasonal = seasonal, + theta_trend = theta_trend, theta_seasonal = theta_season, + r2 = r2, n = n, horizon = horizon, + method = sprintf( + "N-BEATS basis: poly(P=%d) + Fourier(H=%d, period=%d)", + n_trend, n_season, period + ) + ) } diff --git a/r-package/morie/R/nstat.R b/r-package/morie/R/nstat.R index 033052ed99..bf2f8b8832 100644 --- a/r-package/morie/R/nstat.R +++ b/r-package/morie/R/nstat.R @@ -11,35 +11,41 @@ #' n, method. #' @references Sampson & Guttorp (1992); Schabenberger & Gotway (2005), Ch 8. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' nstat(x = rnorm(50), coords = matrix(runif(100), 50, 2)) #' @export nstat <- function(x, coords, bandwidth = NULL) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } if (nrow(coords) != n) stop("coords rows must match length(x)") D <- as.matrix(stats::dist(coords)) - if (is.null(bandwidth)) + if (is.null(bandwidth)) { bandwidth <- if (n > 1) stats::median(D[D > 0]) else 1 + } if (bandwidth <= 0) bandwidth <- 1 - K <- exp(-0.5 * (D / bandwidth) ^ 2) + K <- exp(-0.5 * (D / bandwidth)^2) wsum <- rowSums(K) mu_local <- as.numeric((K %*% x) / pmax(wsum, 1)) dev <- x - mu_local - var_local <- as.numeric((K %*% (dev ^ 2)) / pmax(wsum, 1)) + var_local <- as.numeric((K %*% (dev^2)) / pmax(wsum, 1)) sigma_local <- sqrt(pmax(var_local, 1e-12)) eps <- dev / sigma_local rho <- K * outer(eps, eps) / (sqrt(outer(wsum, wsum)) + 1e-12) Cmat <- outer(sigma_local, sigma_local) * rho - list(estimate = list(sigma_local = sigma_local, C_matrix = Cmat, - bandwidth = bandwidth), - n = n, method = "Non-stationary covariance (moving-window kernel)") + list( + estimate = list( + sigma_local = sigma_local, C_matrix = Cmat, + bandwidth = bandwidth + ), + n = n, method = "Non-stationary covariance (moving-window kernel)" + ) } #' @rdname nstat #' @keywords internal #' @export -nonstationary_covariance <- nstat +morie_nonstationary_covariance <- nstat diff --git a/r-package/morie/R/okrig.R b/r-package/morie/R/okrig.R index 3d594fb3b9..3b984d4d5d 100644 --- a/r-package/morie/R/okrig.R +++ b/r-package/morie/R/okrig.R @@ -14,51 +14,66 @@ #' @return Named list: estimate, se, n, method. #' @references Schabenberger & Gotway (2005), Ch 4. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' okrig(x = rnorm(50), coords = matrix(runif(100), 50, 2), target = rnorm(50)) #' @export okrig <- function(x, coords, target, model = "exponential", nugget = 0, sill = 1, range_ = 1) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) - target <- if (is.matrix(target)) target else + } + target <- if (is.matrix(target)) { + target + } else { matrix(as.numeric(unlist(target)), ncol = ncol(coords)) + } if (nrow(coords) != n) stop("coords rows must match length(x)") if (ncol(target) != ncol(coords)) stop("target dim mismatch") - c0 <- nugget; c1 <- sill - nugget; a <- range_ + c0 <- nugget + c1 <- sill - nugget + a <- range_ cov_fn <- function(h) { switch(model, - exponential = c1 * exp(-h / a) + ifelse(h == 0, c0, 0), - gaussian = c1 * exp(-(h ^ 2) / (a ^ 2)) + ifelse(h == 0, c0, 0), - spherical = ifelse(h <= a, - c1 * (1 - 1.5 * h / a + 0.5 * (h / a) ^ 3), - 0) + ifelse(h == 0, c0, 0), - stop("unknown model")) + exponential = c1 * exp(-h / a) + ifelse(h == 0, c0, 0), + gaussian = c1 * exp(-(h^2) / (a^2)) + ifelse(h == 0, c0, 0), + spherical = ifelse(h <= a, + c1 * (1 - 1.5 * h / a + 0.5 * (h / a)^3), + 0 + ) + ifelse(h == 0, c0, 0), + stop("unknown model") + ) } Dnn <- as.matrix(stats::dist(coords)) C <- cov_fn(Dnn) A <- matrix(0, n + 1, n + 1) - A[1:n, 1:n] <- C; A[1:n, n + 1] <- 1; A[n + 1, 1:n] <- 1 + A[1:n, 1:n] <- C + A[1:n, n + 1] <- 1 + A[n + 1, 1:n] <- 1 total_var <- c0 + c1 m <- nrow(target) - ests <- numeric(m); ses <- numeric(m) + ests <- numeric(m) + ses <- numeric(m) for (k in seq_len(m)) { - d0 <- sqrt(colSums((t(coords) - target[k, ]) ^ 2)) + d0 <- sqrt(colSums((t(coords) - target[k, ])^2)) c_vec <- cov_fn(d0) rhs <- c(c_vec, 1) sol <- tryCatch(solve(A, rhs), - error = function(e) qr.solve(A, rhs)) - lam <- sol[1:n]; mu <- sol[n + 1] + error = function(e) qr.solve(A, rhs) + ) + lam <- sol[1:n] + mu <- sol[n + 1] ests[k] <- sum(lam * x) var_pred <- max(total_var - sum(lam * c_vec) - mu, 0) ses[k] <- sqrt(var_pred) } - list(estimate = if (m == 1) ests[1] else ests, - se = if (m == 1) ses[1] else ses, - n = n, method = sprintf("Ordinary kriging (%s)", model)) + list( + estimate = if (m == 1) ests[1] else ests, + se = if (m == 1) ses[1] else ses, + n = n, method = sprintf("Ordinary kriging (%s)", model) + ) } # CANONICAL TEST @@ -68,4 +83,4 @@ okrig <- function(x, coords, target, model = "exponential", #' @rdname okrig #' @keywords internal #' @export -ordinary_kriging <- okrig +morie_ordinary_kriging <- okrig diff --git a/r-package/morie/R/optcl.R b/r-package/morie/R/optcl.R index 7907d737a5..54c6a079bb 100644 --- a/r-package/morie/R/optcl.R +++ b/r-package/morie/R/optcl.R @@ -11,39 +11,52 @@ #' @return Named list with `cut`, `correct_class`, `polarity`, `pre`, #' `n`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' optcl(x = rnorm(50)) #' @export optcl <- function(x, votes = NULL) { - x <- as.numeric(x); n <- length(x) + x <- as.numeric(x) + n <- length(x) if (n == 0L) { - return(list(cut = NA_real_, correct_class = 0L, polarity = 1L, - pre = NA_real_, n = 0L, method = "optimal_classification")) + return(list( + cut = NA_real_, correct_class = 0L, polarity = 1L, + pre = NA_real_, n = 0L, method = "morie_optimal_classification" + )) } if (is.null(votes)) { - return(list(cut = stats::median(x), - correct_class = as.integer(n %/% 2L + n %% 2L), - polarity = 1L, pre = NA_real_, n = n, - method = "optimal_classification")) + return(list( + cut = stats::median(x), + correct_class = as.integer(n %/% 2L + n %% 2L), + polarity = 1L, pre = NA_real_, n = n, + method = "morie_optimal_classification" + )) } y <- as.integer(votes) xs <- sort(x) cand <- c(xs[1] - 1, (xs[-length(xs)] + xs[-1]) / 2, xs[length(xs)] + 1) - best_cc <- -1L; best_cut <- stats::median(x); best_pol <- 1L - for (c in cand) for (pol in c(1L, -1L)) { - pred <- if (pol == 1L) as.integer(x > c) else as.integer(x <= c) - cc <- sum(pred == y) - if (cc > best_cc) { best_cc <- cc; best_cut <- c; best_pol <- pol } + best_cc <- -1L + best_cut <- stats::median(x) + best_pol <- 1L + for (c in cand) { + for (pol in c(1L, -1L)) { + pred <- if (pol == 1L) as.integer(x > c) else as.integer(x <= c) + cc <- sum(pred == y) + if (cc > best_cc) { + best_cc <- cc + best_cut <- c + best_pol <- pol + } + } } - p <- mean(y); base_correct <- max(p, 1 - p) * n + p <- mean(y) + base_correct <- max(p, 1 - p) * n pre <- if (n > base_correct) (best_cc - base_correct) / (n - base_correct) else 0 - list(cut = best_cut, correct_class = best_cc, polarity = best_pol, - pre = pre, n = n, method = "optimal_classification") + list( + cut = best_cut, correct_class = best_cc, polarity = best_pol, + pre = pre, n = n, method = "morie_optimal_classification" + ) } #' @keywords internal #' @rdname optcl #' @export -optimal_classification <- optcl +morie_optimal_classification <- optcl diff --git a/r-package/morie/R/ordct.R b/r-package/morie/R/ordct.R index a16cb3efb3..9aa94daa44 100644 --- a/r-package/morie/R/ordct.R +++ b/r-package/morie/R/ordct.R @@ -12,18 +12,20 @@ #' @return Named list: statistic (M^2), p_value, df, n, correlation. #' @importFrom stats cor pchisq #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ordered_categories(x = rnorm(50)) #' @export -ordered_categories <- function(x, row_scores = NULL, col_scores = NULL) { - X <- as.matrix(x); storage.mode(X) <- "numeric" - r <- nrow(X); c <- ncol(X); n_total <- sum(X) +morie_ordered_categories <- function(x, row_scores = NULL, col_scores = NULL) { + X <- as.matrix(x) + storage.mode(X) <- "numeric" + r <- nrow(X) + c <- ncol(X) + n_total <- sum(X) if (r < 2 || c < 2 || n_total < 2) { - return(list(statistic = NA_real_, p_value = NA_real_, df = 1L, - n = as.integer(n_total), correlation = NA_real_, - method = "Linear-by-linear association")) + return(list( + statistic = NA_real_, p_value = NA_real_, df = 1L, + n = as.integer(n_total), correlation = NA_real_, + method = "Linear-by-linear association" + )) } if (is.null(row_scores)) row_scores <- seq_len(r) if (is.null(col_scores)) col_scores <- seq_len(c) @@ -31,9 +33,11 @@ ordered_categories <- function(x, row_scores = NULL, col_scores = NULL) { U <- rep(rep(row_scores, each = c), times = as.integer(t(X))) V <- rep(rep(col_scores, times = r), times = as.integer(t(X))) if (length(U) < 2 || stats::sd(U) == 0 || stats::sd(V) == 0) { - return(list(statistic = 0, p_value = 1, df = 1L, - n = as.integer(n_total), correlation = 0, - method = "Linear-by-linear association")) + return(list( + statistic = 0, p_value = 1, df = 1L, + n = as.integer(n_total), correlation = 0, + method = "Linear-by-linear association" + )) } rho <- stats::cor(U, V) M2 <- (n_total - 1) * rho^2 diff --git a/r-package/morie/R/ordlt.R b/r-package/morie/R/ordlt.R deleted file mode 100644 index 85fd1be57c..0000000000 --- a/r-package/morie/R/ordlt.R +++ /dev/null @@ -1,65 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -#' Jonckheere-Terpstra ordered-alternatives test (Gibbons & Chakraborti Ch 10.6) -#' -#' R parity for ``morie.fn.ordlt.ordered_alternatives_test``. The -#' Python module also exports a proportional-odds ``ordered_logit`` -#' estimator (kept as a separate R callable in a future release). -#' -#' Computes the Jonckheere-Terpstra statistic J = sum over (i < j) of -#' Mann-Whitney U_ij for groups with a presumed monotonic ordering, -#' then standardises to z = (J - E_J) / sqrt(Var_J) and reports a -#' two-sided p-value via the normal approximation. -#' -#' @param groups A list of numeric vectors, one per ordered group. The -#' list order is the assumed direction of the alternative -#' hypothesis. -#' @return Named list with `statistic` (z), `p_value`, `J`, -#' `E_J`, `Var_J`, `n`, `method`. -#' @references Gibbons J. D. & Chakraborti S. (2014). Nonparametric -#' Statistical Inference (5th ed.), Ch. 10.6. -#' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } -#' @export -ordered_alternatives_test <- function(groups) { - if (!is.list(groups) || length(groups) < 2L) { - stop("`groups` must be a list of at least two numeric vectors") - } - k <- length(groups) - ns <- vapply(groups, length, integer(1L)) - N <- sum(ns) - if (any(ns < 1L)) stop("each group must be non-empty") - - # J = sum_{i 0) (J - EJ) / sqrt(VJ) else NA_real_ - p <- if (is.finite(z)) 2 * stats::pnorm(-abs(z)) else NA_real_ - - list( - statistic = z, - p_value = p, - J = J, - E_J = EJ, - Var_J = VJ, - n = N, - method = "Jonckheere-Terpstra ordered-alternatives (normal approx)" - ) -} diff --git a/r-package/morie/R/ordlt_jonckheere.R b/r-package/morie/R/ordlt_jonckheere.R index 5071d7558f..4f446133a8 100644 --- a/r-package/morie/R/ordlt_jonckheere.R +++ b/r-package/morie/R/ordlt_jonckheere.R @@ -14,29 +14,30 @@ #' @return Named list: statistic, p_value, z, E_J, Var_J, n, k, method. #' @importFrom stats pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_ordered_alternatives_test(groups = list(rnorm(20), rnorm(20), rnorm(20))) #' @export -ordered_alternatives_test <- function(groups) { - if (!is.list(groups) || length(groups) < 2) - return(list(statistic = NA_real_, p_value = NA_real_, z = NA_real_, - n = 0L, k = length(groups), - method = "Jonckheere-Terpstra ordered-alternatives test")) +morie_ordered_alternatives_test <- function(groups) { + if (!is.list(groups) || length(groups) < 2) { + return(list( + statistic = NA_real_, p_value = NA_real_, z = NA_real_, + n = 0L, k = length(groups), + method = "Jonckheere-Terpstra ordered-alternatives test" + )) + } arrs <- lapply(groups, as.numeric) k <- length(arrs) J <- 0 for (i in 1:(k - 1)) { for (j in (i + 1):k) { - ai <- arrs[[i]]; aj <- arrs[[j]] + ai <- arrs[[i]] + aj <- arrs[[j]] # Vectorise: for each ai count #(aj > ai) + 0.5 * #(aj == ai) lt <- sum(outer(ai, aj, "<")) eq <- sum(outer(ai, aj, "==")) J <- J + lt + 0.5 * eq } } - ns <- sapply(arrs, length) + ns <- vapply(arrs, length, integer(1)) N <- sum(ns) E_J <- (N^2 - sum(ns^2)) / 4 Var_J <- (N^2 * (2 * N + 3) - sum(ns^2 * (2 * ns + 3))) / 72 diff --git a/r-package/morie/R/paths.R b/r-package/morie/R/paths.R index f759bbf511..d7da3f76d7 100644 --- a/r-package/morie/R/paths.R +++ b/r-package/morie/R/paths.R @@ -1,7 +1,11 @@ # Internal infix helper for defaults. `%||%` <- function(x, y) { - if (is.null(x) || length(x) == 0) return(y) - if (length(x) == 1 && (is.na(x) || identical(x, ""))) return(y) + if (is.null(x) || length(x) == 0) { + return(y) + } + if (length(x) == 1 && (is.na(x) || identical(x, ""))) { + return(y) + } x } @@ -19,12 +23,11 @@ is_absolute_path <- function(path) { #' @param max_up Maximum number of parent traversals. #' @return Absolute path to the detected project root. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' tryCatch(morie_find_project_root(), +#' error = function(e) message("not inside a morie project tree") +#' ) #' @export -find_project_root <- function(start = getwd(), max_up = 10L) { +morie_find_project_root <- function(start = getwd(), max_up = 10L) { current <- normalizePath(start, winslash = "/", mustWork = FALSE) for (i in seq_len(max_up)) { @@ -52,13 +55,12 @@ find_project_root <- function(start = getwd(), max_up = 10L) { #' current working directory. #' @return Named list of key paths. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' tryCatch(morie_paths(), +#' error = function(e) message("not inside a morie project tree") +#' ) #' @export morie_paths <- function(project_root = NULL) { - root <- project_root %||% find_project_root() + root <- project_root %||% morie_find_project_root() root <- normalizePath(root, winslash = "/", mustWork = FALSE) list( diff --git a/r-package/morie/R/pcadm.R b/r-package/morie/R/pcadm.R index 3b109bc4fa..29e84c510e 100644 --- a/r-package/morie/R/pcadm.R +++ b/r-package/morie/R/pcadm.R @@ -11,15 +11,13 @@ #' explained_variance_ratio, singular_values, scores, n_components, #' n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_pca_dimension_reduction(x = rnorm(50)) #' @export -pca_dimension_reduction <- function(x, n_components = NULL, seed = 0L) { +morie_pca_dimension_reduction <- function(x, n_components = NULL, seed = 0L) { if (is.null(dim(x))) x <- matrix(x, ncol = 1) x <- as.matrix(x) - n <- nrow(x); p <- ncol(x) + n <- nrow(x) + p <- ncol(x) k <- if (is.null(n_components)) min(n, p) else n_components pc <- stats::prcomp(x, center = TRUE, scale. = FALSE, rank. = k) sv <- pc$sdev[seq_len(k)] @@ -33,7 +31,7 @@ pca_dimension_reduction <- function(x, n_components = NULL, seed = 0L) { components = components, explained_variance = as.numeric(ev), explained_variance_ratio = as.numeric(ratio), - singular_values = as.numeric(sv * sqrt(n - 1)), # match sklearn's S + singular_values = as.numeric(sv * sqrt(n - 1)), # match sklearn's S scores = scores, n_components = as.integer(k), n = n, diff --git a/r-package/morie/R/pctmr.R b/r-package/morie/R/pctmr.R index 5126af6f40..d4b488baef 100644 --- a/r-package/morie/R/pctmr.R +++ b/r-package/morie/R/pctmr.R @@ -11,32 +11,35 @@ #' @return Named list: statistic, p_value, z, n, m, q. #' @importFrom stats pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_percentile_modified_rank(x = rnorm(50), y = rnorm(50)) #' @export -percentile_modified_rank <- function(x, y, q = 0.25) { - x <- as.numeric(x); y <- as.numeric(y) - m <- length(x); n <- length(y); N <- m + n +morie_percentile_modified_rank <- function(x, y, q = 0.25) { + x <- as.numeric(x) + y <- as.numeric(y) + m <- length(x) + n <- length(y) + N <- m + n if (m < 2 || n < 2) { - return(list(statistic = NA_real_, p_value = NA_real_, z = NA_real_, - n = N, m = m, q = q, - method = "Percentile-modified rank test")) + return(list( + statistic = NA_real_, p_value = NA_real_, z = NA_real_, + n = N, m = m, q = q, + method = "Percentile-modified rank test" + )) } - if (!(q > 0 && q < 0.5)) + if (!(q > 0 && q < 0.5)) { stop("q must lie strictly between 0 and 0.5") + } pooled <- c(x, y) R <- rank(pooled) upper_cut <- (1 - q) * (N + 1) lower_cut <- q * (N + 1) a <- pmax(R - upper_cut, 0) - pmax(lower_cut - R, 0) - T <- sum(a[1:m]) + stat_t <- sum(a[1:m]) Var_T <- (m * n / (N * (N - 1))) * sum(a^2) - z <- T / sqrt(Var_T) + z <- stat_t / sqrt(Var_T) p <- 2 * (1 - stats::pnorm(abs(z))) list( - statistic = T, + statistic = stat_t, p_value = p, z = z, n = N, diff --git a/r-package/morie/R/penls.R b/r-package/morie/R/penls.R index e9de37a1e5..d6ef16b55c 100644 --- a/r-package/morie/R/penls.R +++ b/r-package/morie/R/penls.R @@ -15,62 +15,81 @@ #' @return list(estimate, beta, intercept, se, alpha, lam, n_iter, n, p, method). #' @references Friedman, Hastie & Tibshirani (2010); Montesinos Lopez Ch 6. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -penalized_regression <- function(x, y, alpha = 0.5, lam = 1.0, - max_iter = 1000, tol = 1e-6) { - X <- as.matrix(x); y <- as.numeric(y); n <- nrow(X); p <- ncol(X) +morie_penalized_regression <- function(x, y, alpha = 0.5, lam = 1.0, + max_iter = 1000, tol = 1e-6) { + x <- .morie_ensure_design_matrix(x) + X <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(X) + p <- ncol(X) use_glmnet <- requireNamespace("glmnet", quietly = TRUE) if (use_glmnet) { - fit <- glmnet::glmnet(X, y, alpha = alpha, lambda = lam, - standardize = TRUE, intercept = TRUE) + fit <- glmnet::glmnet(X, y, + alpha = alpha, lambda = lam, + standardize = TRUE, intercept = TRUE + ) beta <- as.numeric(fit$beta[, 1]) intercept <- as.numeric(fit$a0) y_hat <- as.numeric(X %*% beta) + intercept resid <- y - y_hat se <- sqrt(sum(resid^2) / max(n - p, 1)) - return(list(estimate = mean(abs(beta)), beta = beta, intercept = intercept, - y_hat = y_hat, se = se, alpha = alpha, lam = lam, - n_iter = NA_integer_, n = n, p = p, - method = "glmnet elastic-net")) + return(list( + estimate = mean(abs(beta)), beta = beta, intercept = intercept, + y_hat = y_hat, se = se, alpha = alpha, lam = lam, + n_iter = NA_integer_, n = n, p = p, + method = "glmnet elastic-net" + )) } - ym <- mean(y); yc <- y - ym - xm <- colMeans(X); xs <- apply(X, 2, stats::sd); xs[xs == 0] <- 1 + ym <- mean(y) + yc <- y - ym + xm <- colMeans(X) + xs <- apply(X, 2, stats::sd) + xs[xs == 0] <- 1 Xs <- sweep(sweep(X, 2, xm), 2, xs, "/") beta <- rep(0, p) xtx_diag <- colSums(Xs^2) / n r <- yc - as.numeric(Xs %*% beta) - soft <- lam * alpha; ridge_t <- lam * (1 - alpha) + soft <- lam * alpha + ridge_t <- lam * (1 - alpha) n_iter_done <- max_iter for (it in seq_len(max_iter)) { max_change <- 0 for (j in seq_len(p)) { r_j <- r + Xs[, j] * beta[j] z <- sum(Xs[, j] * r_j) / n - if (z > soft) new <- (z - soft) / (xtx_diag[j] + ridge_t) - else if (z < -soft) new <- (z + soft) / (xtx_diag[j] + ridge_t) - else new <- 0 + if (z > soft) { + new <- (z - soft) / (xtx_diag[j] + ridge_t) + } else if (z < -soft) { + new <- (z + soft) / (xtx_diag[j] + ridge_t) + } else { + new <- 0 + } change <- new - beta[j] if (abs(change) > max_change) max_change <- abs(change) beta[j] <- new r <- r_j - Xs[, j] * new } - if (max_change < tol) { n_iter_done <- it; break } + if (max_change < tol) { + n_iter_done <- it + break + } } beta_orig <- beta / xs intercept <- ym - sum(xm * beta_orig) y_hat <- as.numeric(X %*% beta_orig) + intercept resid <- y - y_hat se <- sqrt(sum(resid^2) / max(n - p, 1)) - list(estimate = mean(abs(beta_orig)), beta = beta_orig, - intercept = intercept, y_hat = y_hat, se = se, - alpha = alpha, lam = lam, n_iter = n_iter_done, - n = n, p = p, method = "Elastic-net coord descent (base R)") + list( + estimate = mean(abs(beta_orig)), beta = beta_orig, + intercept = intercept, y_hat = y_hat, se = se, + alpha = alpha, lam = lam, n_iter = n_iter_done, + n = n, p = p, method = "Elastic-net coord descent (base R)" + ) } # CANONICAL TEST # set.seed(10); X <- matrix(rnorm(120), 30, 4); b <- c(1,0,-1,0) -# y <- X %*% b + 0.1*rnorm(30); penalized_regression(X, y, alpha=1, lam=0.05)$beta +# y <- X %*% b + 0.1*rnorm(30); morie_penalized_regression(X, y, alpha=1, lam=0.05)$beta diff --git a/r-package/morie/R/permt.R b/r-package/morie/R/permt.R index 3180209e45..5f2a2c76ee 100644 --- a/r-package/morie/R/permt.R +++ b/r-package/morie/R/permt.R @@ -16,31 +16,42 @@ permt <- function(x, y, statistic = NULL, B = 5000L, alternative = c("two-sided", "less", "greater"), seed = 42L) { alternative <- match.arg(alternative) - x <- as.numeric(x); y <- as.numeric(y) - n_x <- length(x); n_y <- length(y) + x <- as.numeric(x) + y <- as.numeric(y) + n_x <- length(x) + n_y <- length(y) if (n_x < 1L || n_y < 1L) { - return(list(statistic = NA_real_, p_value = NA_real_, - n_x = n_x, n_y = n_y, method = "permt (empty)")) + return(list( + statistic = NA_real_, p_value = NA_real_, + n_x = n_x, n_y = n_y, method = "permt (empty)" + )) } - if (is.null(statistic)) + if (is.null(statistic)) { statistic <- function(a, b) mean(a) - mean(b) + } T_obs <- statistic(x, y) - pool <- c(x, y); m <- length(pool) + pool <- c(x, y) + m <- length(pool) set.seed(seed) T_perm <- numeric(B) for (b in seq_len(B)) { ord <- sample.int(m) - T_perm[b] <- statistic(pool[ord[seq_len(n_x)]], - pool[ord[(n_x + 1):m]]) + T_perm[b] <- statistic( + pool[ord[seq_len(n_x)]], + pool[ord[(n_x + 1):m]] + ) } p <- switch(alternative, - greater = (1 + sum(T_perm >= T_obs)) / (B + 1), - less = (1 + sum(T_perm <= T_obs)) / (B + 1), - `two-sided` = (1 + sum(abs(T_perm) >= abs(T_obs))) / (B + 1)) - list(statistic = as.numeric(T_obs), p_value = as.numeric(p), - n_x = as.integer(n_x), n_y = as.integer(n_y), - B = as.integer(B), alternative = alternative, - method = "Permutation test (Good 2005)") + greater = (1 + sum(T_perm >= T_obs)) / (B + 1), + less = (1 + sum(T_perm <= T_obs)) / (B + 1), + `two-sided` = (1 + sum(abs(T_perm) >= abs(T_obs))) / (B + 1) + ) + list( + statistic = as.numeric(T_obs), p_value = as.numeric(p), + n_x = as.integer(n_x), n_y = as.integer(n_y), + B = as.integer(B), alternative = alternative, + method = "Permutation test (Good 2005)" + ) } # CANONICAL TEST @@ -51,4 +62,4 @@ permt <- function(x, y, statistic = NULL, B = 5000L, #' @rdname permt #' @keywords internal #' @export -permutation_test_general <- permt +morie_permutation_test_general <- permt diff --git a/r-package/morie/R/perseus.R b/r-package/morie/R/perseus.R index 492d0c93bf..14f8e3ee53 100644 --- a/r-package/morie/R/perseus.R +++ b/r-package/morie/R/perseus.R @@ -4,12 +4,10 @@ #' @param context Optional context string. #' @return Character scalar prompt. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -build_prompt <- function(question, context = NULL) { +morie_build_prompt <- function(question, context = NULL) { question <- trimws(as.character(question)[1]) if (!nzchar(question)) { stop("`question` must be non-empty.", call. = FALSE) @@ -32,18 +30,16 @@ build_prompt <- function(question, context = NULL) { #' @param python_bin Python executable to use. Defaults to `MORIE_PYTHON_BIN` or `python3`. #' @return Agent text response. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -ask_percy <- function(question, context = NULL, python_bin = Sys.getenv("MORIE_PYTHON_BIN", "python3")) { - prompt <- build_prompt(question, context = context) +morie_ask_percy <- function(question, context = NULL, python_bin = Sys.getenv("MORIE_PYTHON_BIN", "python3")) { + prompt <- morie_build_prompt(question, context = context) code <- paste( "import json, sys", - "from morie.perseus import ask_percy", - "payload = ask_percy(question=sys.argv[1])", + "from morie.perseus import morie_ask_percy", + "payload = morie_ask_percy(question=sys.argv[1])", "print(payload['output_text'])", sep = "; " ) @@ -67,10 +63,10 @@ ask_percy <- function(question, context = NULL, python_bin = Sys.getenv("MORIE_P paste(out, collapse = "\n") } -#' @rdname build_prompt +#' @rdname morie_build_prompt #' @keywords internal -build_assistant_prompt <- build_prompt +build_assistant_prompt <- morie_build_prompt -#' @rdname ask_percy +#' @rdname morie_ask_percy #' @keywords internal -morie_assistant_query <- ask_percy +morie_assistant_query <- morie_ask_percy diff --git a/r-package/morie/R/plcmt.R b/r-package/morie/R/plcmt.R index 9772dfbbaf..af06ad9b76 100644 --- a/r-package/morie/R/plcmt.R +++ b/r-package/morie/R/plcmt.R @@ -8,18 +8,19 @@ #' @param x,y Numeric vectors. #' @return Named list: placements, ranks_y, U_y, E_U, Var_U, m, n. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_rank_placements(x = rnorm(50), y = rnorm(50)) #' @export -rank_placements <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) - m <- length(x); n <- length(y) +morie_rank_placements <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) + m <- length(x) + n <- length(y) if (m < 1 || n < 1) { - return(list(placements = integer(0), ranks_y = numeric(0), - U_y = NA_real_, E_U = NA_real_, Var_U = NA_real_, - m = m, n = n, method = "Rank placements")) + return(list( + placements = integer(0), ranks_y = numeric(0), + U_y = NA_real_, E_U = NA_real_, Var_U = NA_real_, + m = m, n = n, method = "Rank placements" + )) } xs <- sort(x) placements <- as.integer(findInterval(y, xs, left.open = FALSE)) diff --git a/r-package/morie/R/polrg.R b/r-package/morie/R/polrg.R index 8ccbdd2e2f..b8f2054c8a 100644 --- a/r-package/morie/R/polrg.R +++ b/r-package/morie/R/polrg.R @@ -11,15 +11,14 @@ #' @param degree Polynomial degree. #' @return Named list: estimate, se, feature_names, degree, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_polynomial_regression(x = rnorm(50), y = rnorm(50)) #' @export -polynomial_regression <- function(x, y, degree = 2L) { +morie_polynomial_regression <- function(x, y, degree = 2L) { if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); y <- as.numeric(y) - n <- nrow(x); p <- ncol(x) + x <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(x) + p <- ncol(x) cols <- list() names_ <- character() for (j in seq_len(p)) { @@ -32,14 +31,16 @@ polynomial_regression <- function(x, y, degree = 2L) { if (p > 1L && degree >= 2L) { combos <- utils::combn(p, 2) for (k in seq_len(ncol(combos))) { - i1 <- combos[1, k]; i2 <- combos[2, k] + i1 <- combos[1, k] + i2 <- combos[2, k] cols[[length(cols) + 1L]] <- x[, i1] * x[, i2] names_ <- c(names_, paste0("x", i1 - 1L, " x", i2 - 1L)) } } Xp <- do.call(cbind, cols) colnames(Xp) <- names_ - df <- as.data.frame(Xp); df$.y <- y + df <- as.data.frame(Xp) + df$.y <- y fit <- stats::lm(.y ~ ., data = df) s <- summary(fit) list( diff --git a/r-package/morie/R/polrz.R b/r-package/morie/R/polrz.R index ad8fb19846..235a2c4bda 100644 --- a/r-package/morie/R/polrz.R +++ b/r-package/morie/R/polrz.R @@ -11,40 +11,49 @@ #' @return Named list with `estimate`, `mean_R`, `mean_D`, `sd_R`, #' `sd_D`, `pooled_sd`, `n_R`, `n_D`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' polrz(x = rnorm(50)) #' @export polrz <- function(x, group = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 2L) return(list(estimate = NA_real_, n = n, - method = "polarization_index")) + x <- as.numeric(x) + n <- length(x) + if (n < 2L) { + return(list( + estimate = NA_real_, n = n, + method = "morie_polarization_index" + )) + } if (is.null(group)) { - med <- stats::median(x); g <- as.integer(x >= med) + med <- stats::median(x) + g <- as.integer(x >= med) } else { if (length(group) != n) stop("group must have length(x)") uniq <- unique(group) if (length(uniq) != 2L) stop("group must have exactly 2 levels") g <- as.integer(group == uniq[2]) } - xR <- x[g == 1L]; xD <- x[g == 0L] - if (length(xR) < 1L || length(xD) < 1L) - return(list(estimate = NA_real_, n = n, method = "polarization_index")) - mR <- mean(xR); mD <- mean(xD) + xR <- x[g == 1L] + xD <- x[g == 0L] + if (length(xR) < 1L || length(xD) < 1L) { + return(list(estimate = NA_real_, n = n, method = "morie_polarization_index")) + } + mR <- mean(xR) + mD <- mean(xD) sR <- if (length(xR) > 1L) stats::sd(xR) else 0 sD <- if (length(xD) > 1L) stats::sd(xD) else 0 - nR <- length(xR); nD <- length(xD) + nR <- length(xR) + nD <- length(xD) pooled <- sqrt(((nR - 1) * sR^2 + (nD - 1) * sD^2) - / max(nR + nD - 2L, 1L)) + / max(nR + nD - 2L, 1L)) if (pooled <= 0) pooled <- if (n > 1L) stats::sd(x) else 0 pol <- if (pooled > 0) abs(mR - mD) / pooled else NA_real_ - list(estimate = pol, mean_R = mR, mean_D = mD, sd_R = sR, sd_D = sD, - pooled_sd = pooled, n_R = nR, n_D = nD, - method = "polarization_index") + list( + estimate = pol, mean_R = mR, mean_D = mD, sd_R = sR, sd_D = sD, + pooled_sd = pooled, n_R = nR, n_D = nD, + method = "morie_polarization_index" + ) } #' @keywords internal #' @rdname polrz #' @export -polarization_index <- polrz +morie_polarization_index <- polrz diff --git a/r-package/morie/R/posab.R b/r-package/morie/R/posab.R index 1d90052d49..c8a4ef2730 100644 --- a/r-package/morie/R/posab.R +++ b/r-package/morie/R/posab.R @@ -13,29 +13,31 @@ #' @return Named list \code{(PE, estimate, seq_len, d_model, method)}. #' @references Vaswani et al. (2017), NeurIPS. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -posab_positional_encoding_abs <- function(seq_len, d_model, base = 10000) { - seq_len <- as.integer(seq_len); d_model <- as.integer(d_model) - if (seq_len <= 0 || d_model <= 0) +morie_posab_positional_encoding_abs <- function(seq_len, d_model, base = 10000) { + seq_len <- as.integer(seq_len) + d_model <- as.integer(d_model) + if (seq_len <= 0 || d_model <= 0) { stop("seq_len and d_model must be > 0.") + } pos <- matrix(seq_len(seq_len) - 1L, ncol = 1L) i <- matrix(0:(d_model - 1L), nrow = 1L) - div_term <- base ^ ((2 * (i %/% 2L)) / d_model) + div_term <- base^((2 * (i %/% 2L)) / d_model) angles <- pos[, rep(1L, d_model)] / div_term[rep(1L, seq_len), ] PE <- angles even <- seq(1L, d_model, by = 2L) odd <- seq(2L, d_model, by = 2L) PE[, even] <- sin(angles[, even, drop = FALSE]) - PE[, odd] <- cos(angles[, odd, drop = FALSE]) - list(PE = PE, estimate = PE, seq_len = seq_len, d_model = d_model, - method = "Sinusoidal positional encoding") + PE[, odd] <- cos(angles[, odd, drop = FALSE]) + list( + PE = PE, estimate = PE, seq_len = seq_len, d_model = d_model, + method = "Sinusoidal positional encoding" + ) } -#' @rdname posab_positional_encoding_abs +#' @rdname morie_posab_positional_encoding_abs #' @keywords internal #' @export -positional_encoding_abs <- posab_positional_encoding_abs +morie_positional_encoding_abs <- morie_posab_positional_encoding_abs diff --git a/r-package/morie/R/pplxm.R b/r-package/morie/R/pplxm.R index d28b274630..e07500f5d9 100644 --- a/r-package/morie/R/pplxm.R +++ b/r-package/morie/R/pplxm.R @@ -9,9 +9,13 @@ perplexity_metric <- function(x, base = "e") { logp <- as.numeric(x) if (!length(logp)) stop("Need at least one token log-prob") - if (identical(base, "2")) logp <- logp * log(2) - else if (!identical(base, "e")) stop("base must be 'e' or '2'") - nll <- -mean(logp); ppl <- exp(nll) - list(value = ppl, nll = nll, n = length(logp), - method = "perplexity") + if (identical(base, "2")) { + logp <- logp * log(2) + } else if (!identical(base, "e")) stop("base must be 'e' or '2'") + nll <- -mean(logp) + ppl <- exp(nll) + list( + value = ppl, nll = nll, n = length(logp), + method = "perplexity" + ) } diff --git a/r-package/morie/R/propc.R b/r-package/morie/R/propc.R index cf8ed1aa99..fb0d5ce4e3 100644 --- a/r-package/morie/R/propc.R +++ b/r-package/morie/R/propc.R @@ -7,27 +7,29 @@ #' @return Named list with \code{trend, seasonal, residual, slope, #' intercept, fourier_terms, period, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_prophet_components(x = rnorm(50)) #' @export -prophet_components <- function(x, period = 12) { - y <- as.numeric(x); n <- length(y) +morie_prophet_components <- function(x, period = 12) { + y <- as.numeric(x) + n <- length(y) if (n < max(2 * period, 6)) stop("Series too short.") t <- seq(0, n - 1) fit <- lm(y ~ t) - intercept <- coef(fit)[1]; slope <- coef(fit)[2] + intercept <- coef(fit)[1] + slope <- coef(fit)[2] trend <- fitted(fit) detr <- y - trend K <- 5 - Fmat <- do.call(cbind, lapply(seq_len(K), function(k) - cbind(sin(2 * pi * k * t / period), cos(2 * pi * k * t / period)))) + Fmat <- do.call(cbind, lapply(seq_len(K), function(k) { + cbind(sin(2 * pi * k * t / period), cos(2 * pi * k * t / period)) + })) fcoef <- lsfit(Fmat, detr, intercept = FALSE)$coef seasonal <- as.numeric(Fmat %*% fcoef) residual <- detr - seasonal - list(trend = trend, seasonal = seasonal, residual = residual, - slope = as.numeric(slope), intercept = as.numeric(intercept), - fourier_terms = fcoef, period = period, n = n, - method = "Prophet-style linear-trend + Fourier(K=5) seasonality") + list( + trend = trend, seasonal = seasonal, residual = residual, + slope = as.numeric(slope), intercept = as.numeric(intercept), + fourier_terms = fcoef, period = period, n = n, + method = "Prophet-style linear-trend + Fourier(K=5) seasonality" + ) } diff --git a/r-package/morie/R/pspln.R b/r-package/morie/R/pspln.R index 71d0d639d6..9a90197f0a 100644 --- a/r-package/morie/R/pspln.R +++ b/r-package/morie/R/pspln.R @@ -13,30 +13,40 @@ #' @importFrom splines bs #' @keywords internal pspln <- function(x, y, n_knots = 20L, degree = 3L, lam = 1) { - x <- as.numeric(x); y <- as.numeric(y); n <- length(x) - if (n < degree + 2L || length(y) != n) + x <- as.numeric(x) + y <- as.numeric(y) + n <- length(x) + if (n < degree + 2L || length(y) != n) { return(list(estimate = NA_real_, n = n, method = "P-spline (n too small)")) + } knots <- seq(min(x), max(x), length.out = n_knots) - B <- splines::bs(x, knots = knots[-c(1, length(knots))], - Boundary.knots = range(knots), - degree = degree, intercept = TRUE) - B <- as.matrix(B); k <- ncol(B) + B <- splines::bs(x, + knots = knots[-c(1, length(knots))], + Boundary.knots = range(knots), + degree = degree, intercept = TRUE + ) + B <- as.matrix(B) + k <- ncol(B) D <- diff(diag(k), differences = 2L) - BtB <- crossprod(B); BtY <- crossprod(B, y) + BtB <- crossprod(B) + BtY <- crossprod(B, y) coef <- as.numeric(solve(BtB + lam * crossprod(D), BtY)) fitted <- as.numeric(B %*% coef) resid <- y - fitted - sse <- sum(resid^2); sst <- sum((y - mean(y))^2) + sse <- sum(resid^2) + sst <- sum((y - mean(y))^2) r2 <- if (sst > 0) 1 - sse / sst else NA_real_ H <- B %*% solve(BtB + lam * crossprod(D), t(B)) edf <- sum(diag(H)) - list(coef = coef, fitted = fitted, residuals = resid, - sse = sse, r2 = as.numeric(r2), - edf = as.numeric(edf), lambda = lam, - estimate = mean(fitted), - se = sqrt(sse / max(1, n - edf)) / sqrt(n), - n = as.integer(n), - method = "P-spline (Eilers & Marx 1996)") + list( + coef = coef, fitted = fitted, residuals = resid, + sse = sse, r2 = as.numeric(r2), + edf = as.numeric(edf), lambda = lam, + estimate = mean(fitted), + se = sqrt(sse / max(1, n - edf)) / sqrt(n), + n = as.integer(n), + method = "P-spline (Eilers & Marx 1996)" + ) } # CANONICAL TEST @@ -47,4 +57,4 @@ pspln <- function(x, y, n_knots = 20L, degree = 3L, lam = 1) { #' @rdname pspln #' @keywords internal #' @export -penalized_spline <- pspln +morie_penalized_spline <- pspln diff --git a/r-package/morie/R/quntf.R b/r-package/morie/R/quntf.R index dc3d3ef46c..fc78c057d0 100644 --- a/r-package/morie/R/quntf.R +++ b/r-package/morie/R/quntf.R @@ -10,24 +10,33 @@ #' @return list: taus, quantiles, se, bandwidth, n, method. #' @keywords internal quntf <- function(x, taus = NULL) { - x <- as.numeric(x); n <- length(x) - if (n < 2L) + x <- as.numeric(x) + n <- length(x) + if (n < 2L) { return(list(estimate = NA_real_, n = n, method = "Quantile fn (n<2)")) + } if (is.null(taus)) taus <- c(0.10, 0.25, 0.50, 0.75, 0.90) q <- stats::quantile(x, probs = taus, names = FALSE, type = 7) sd_x <- stats::sd(x) iqr <- diff(stats::quantile(x, c(0.25, 0.75), names = FALSE)) - h <- if (iqr > 0) 1.06 * min(sd_x, iqr / 1.34) * n^(-0.2) - else 1.06 * sd_x * n^(-0.2) + h <- if (iqr > 0) { + 1.06 * min(sd_x, iqr / 1.34) * n^(-0.2) + } else { + 1.06 * sd_x * n^(-0.2) + } if (!is.finite(h) || h <= 0) h <- if (sd_x > 0) sd_x else 1 - fhat <- vapply(q, function(qi) mean(stats::dnorm(x, mean = qi, sd = h)), - numeric(1)) + fhat <- vapply( + q, function(qi) mean(stats::dnorm(x, mean = qi, sd = h)), + numeric(1) + ) se <- sqrt(taus * (1 - taus) / (n * fhat^2)) - list(taus = taus, quantiles = as.numeric(q), se = as.numeric(se), - bandwidth = as.numeric(h), - estimate = as.numeric(q[ceiling(length(q) / 2)]), - n = as.integer(n), - method = "Empirical quantile function (Parzen 1979)") + list( + taus = taus, quantiles = as.numeric(q), se = as.numeric(se), + bandwidth = as.numeric(h), + estimate = as.numeric(q[ceiling(length(q) / 2)]), + n = as.integer(n), + method = "Empirical quantile function (Parzen 1979)" + ) } # CANONICAL TEST @@ -38,4 +47,4 @@ quntf <- function(x, taus = NULL) { #' @rdname quntf #' @keywords internal #' @export -quantile_function <- quntf +morie_quantile_function <- quntf diff --git a/r-package/morie/R/rcall.R b/r-package/morie/R/rcall.R index fdfb62b134..c95c353e59 100644 --- a/r-package/morie/R/rcall.R +++ b/r-package/morie/R/rcall.R @@ -12,10 +12,7 @@ #' `marginal_yea`, `marginal_nay`, `pct_yea`, `lopsided_pct`, #' `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' rcall(x = rnorm(50)) #' @export rcall <- function(x) { V <- if (is.matrix(x)) x else matrix(as.numeric(x), ncol = 1L) @@ -26,7 +23,8 @@ rcall <- function(x) { Vp[V %in% c(4, 5, 6)] <- 0 V <- Vp } - n <- nrow(V); m <- ncol(V) + n <- nrow(V) + m <- ncol(V) n_yea <- sum(V == 1, na.rm = TRUE) n_nay <- sum(V == 0, na.rm = TRUE) n_abs <- sum(is.na(V)) @@ -35,14 +33,16 @@ rcall <- function(x) { denom <- marg_yea + marg_nay pct_yea <- ifelse(denom > 0, marg_yea / pmax(denom, 1L), NA_real_) lopsided <- mean((pct_yea >= 0.975) | (pct_yea <= 0.025), na.rm = TRUE) - list(n = n, m = m, n_yea = as.integer(n_yea), - n_nay = as.integer(n_nay), n_abs = as.integer(n_abs), - marginal_yea = marg_yea, marginal_nay = marg_nay, - pct_yea = pct_yea, lopsided_pct = as.numeric(lopsided), - method = "roll_call_analysis") + list( + n = n, m = m, n_yea = as.integer(n_yea), + n_nay = as.integer(n_nay), n_abs = as.integer(n_abs), + marginal_yea = marg_yea, marginal_nay = marg_nay, + pct_yea = pct_yea, lopsided_pct = as.numeric(lopsided), + method = "morie_roll_call_analysis" + ) } #' @keywords internal #' @rdname rcall #' @export -roll_call_analysis <- rcall +morie_roll_call_analysis <- rcall diff --git a/r-package/morie/R/regms.R b/r-package/morie/R/regms.R index a42a0c8577..6f0d441d0d 100644 --- a/r-package/morie/R/regms.R +++ b/r-package/morie/R/regms.R @@ -10,25 +10,32 @@ #' @return Named list with \code{mu, sigma, transition, #' smoothed_probabilities, loglik, n, k_regimes, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_regime_switching(x = rnorm(50)) #' @export -regime_switching <- function(x, k_regimes = 2) { - y <- as.numeric(x); n <- length(y) +morie_regime_switching <- function(x, k_regimes = 2) { + y <- as.numeric(x) + n <- length(y) if (n < 4 * k_regimes) stop("Series too short for K regimes.") if (requireNamespace("MSwM", quietly = TRUE)) { df <- data.frame(y = y) base_fit <- lm(y ~ 1, data = df) - msfit <- MSwM::msmFit(base_fit, k = k_regimes, sw = c(TRUE, TRUE)) - return(list(mu = as.numeric(msfit@Coef[, 1]), - sigma = as.numeric(msfit@std), - transition = msfit@transMat, - smoothed_probabilities = msfit@Fit@smoProb, - loglik = msfit@Fit@logLikel, - n = n, k_regimes = k_regimes, - method = sprintf("MSwM (K=%d)", k_regimes))) + # MSwM defaults parallelization = TRUE, which spawns 8 PSOCK workers and + # trips --as-cran's 2-worker cap (R_CHECK_LIMIT_CORES). Disable it: a + # 60-point regime-switching EM is far cheaper to run sequentially anyway. + msfit <- MSwM::msmFit( + base_fit, + k = k_regimes, sw = c(TRUE, TRUE), + control = list(parallelization = FALSE) + ) + return(list( + mu = as.numeric(msfit@Coef[, 1]), + sigma = as.numeric(msfit@std), + transition = msfit@transMat, + smoothed_probabilities = msfit@Fit@smoProb, + loglik = msfit@Fit@logLikel, + n = n, k_regimes = k_regimes, + method = sprintf("MSwM (K=%d)", k_regimes) + )) } mu <- seq(min(y), max(y), length.out = k_regimes) sig <- rep(max(sd(y), 1e-6), k_regimes) @@ -36,17 +43,23 @@ regime_switching <- function(x, k_regimes = 2) { pi <- rep(1 / k_regimes, k_regimes) ll_prev <- -Inf for (it in seq_len(200)) { - emit <- t(sapply(y, - function(yt) dnorm(yt, mean = mu, sd = sig))) + emit <- t(vapply( + y, + function(yt) dnorm(yt, mean = mu, sd = sig), numeric(length(mu)) + )) emit <- pmax(emit, 1e-300) - alpha <- matrix(0, n, k_regimes); cv <- numeric(n) - alpha[1, ] <- pi * emit[1, ]; cv[1] <- sum(alpha[1, ]) + alpha <- matrix(0, n, k_regimes) + cv <- numeric(n) + alpha[1, ] <- pi * emit[1, ] + cv[1] <- sum(alpha[1, ]) alpha[1, ] <- alpha[1, ] / cv[1] for (t in 2:n) { alpha[t, ] <- (alpha[t - 1, ] %*% P) * emit[t, ] - cv[t] <- sum(alpha[t, ]); alpha[t, ] <- alpha[t, ] / max(cv[t], 1e-300) + cv[t] <- sum(alpha[t, ]) + alpha[t, ] <- alpha[t, ] / max(cv[t], 1e-300) } - beta <- matrix(0, n, k_regimes); beta[n, ] <- 1 + beta <- matrix(0, n, k_regimes) + beta[n, ] <- 1 for (t in (n - 1):1) { beta[t, ] <- P %*% (emit[t + 1, ] * beta[t + 1, ]) beta[t, ] <- beta[t, ] / max(sum(beta[t, ]), 1e-300) @@ -70,9 +83,13 @@ regime_switching <- function(x, k_regimes = 2) { if (abs(ll - ll_prev) < 1e-6) break ll_prev <- ll } - list(mu = mu, sigma = sig, transition = P, - smoothed_probabilities = gamma, - loglik = ll_prev, n = n, k_regimes = k_regimes, - method = sprintf("Markov switching via EM/Hamilton filter (K=%d, base R)", - k_regimes)) + list( + mu = mu, sigma = sig, transition = P, + smoothed_probabilities = gamma, + loglik = ll_prev, n = n, k_regimes = k_regimes, + method = sprintf( + "Markov switching via EM/Hamilton filter (K=%d, base R)", + k_regimes + ) + ) } diff --git a/r-package/morie/R/retlv.R b/r-package/morie/R/retlv.R index 72d2841bbd..9ba68beb80 100644 --- a/r-package/morie/R/retlv.R +++ b/r-package/morie/R/retlv.R @@ -12,29 +12,36 @@ #' @keywords internal retlv <- function(x, return_period = 100) { fit <- extvm(x) - if (!is.finite(fit$mu %||% NA_real_)) + if (!is.finite(fit$mu %||% NA_real_)) { return(list(estimate = NA_real_, method = "Return level (GEV fit failed)")) - mu <- fit$mu; sigma <- fit$sigma; xi <- fit$xi + } + mu <- fit$mu + sigma <- fit$sigma + xi <- fit$xi p <- 1 / return_period yp <- -log(1 - p) if (abs(xi) < 1e-6) { z <- mu - sigma * log(yp) - d_mu <- 1; d_sig <- -log(yp); d_xi <- 0.5 * sigma * log(yp)^2 + d_mu <- 1 + d_sig <- -log(yp) + d_xi <- 0.5 * sigma * log(yp)^2 } else { z <- mu - (sigma / xi) * (1 - yp^(-xi)) d_mu <- 1 d_sig <- -(1 / xi) * (1 - yp^(-xi)) d_xi <- (sigma / xi^2) * (1 - yp^(-xi)) - - (sigma / xi) * yp^(-xi) * log(yp) + (sigma / xi) * yp^(-xi) * log(yp) } var_z <- (d_mu * fit$se_mu)^2 + (d_sig * fit$se_sigma)^2 + - (d_xi * fit$se_xi)^2 + (d_xi * fit$se_xi)^2 se <- sqrt(max(0, var_z)) - list(z = as.numeric(z), estimate = as.numeric(z), se = as.numeric(se), - return_period = as.numeric(return_period), - mu = mu, sigma = sigma, xi = xi, - n = length(x), - method = "Return level (Coles 2001)") + list( + z = as.numeric(z), estimate = as.numeric(z), se = as.numeric(se), + return_period = as.numeric(return_period), + mu = mu, sigma = sigma, xi = xi, + n = length(x), + method = "Return level (Coles 2001)" + ) } # small null-coalesce helper local to this file @@ -49,4 +56,4 @@ retlv <- function(x, return_period = 100) { #' @rdname retlv #' @keywords internal #' @export -return_level <- retlv +morie_return_level <- retlv diff --git a/r-package/morie/R/rfens.R b/r-package/morie/R/rfens.R index a74f418fa3..06b9309d28 100644 --- a/r-package/morie/R/rfens.R +++ b/r-package/morie/R/rfens.R @@ -19,23 +19,24 @@ #' n_estimators, task, n, method. #' @importFrom stats predict #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_random_forest_ensemble(x = rnorm(50), y = rnorm(50)) #' @export -random_forest_ensemble <- function(x, y, n_estimators = 100L, - max_depth = NULL, task = "auto", - seed = 0L, - deterministic_seed = NULL) { +morie_random_forest_ensemble <- function(x, y, n_estimators = 100L, + max_depth = NULL, task = "auto", + seed = 0L, + deterministic_seed = NULL) { + x <- .morie_ensure_design_matrix(x) if (!requireNamespace("randomForest", quietly = TRUE)) { - stop("Function 'random_forest_ensemble' requires package 'randomForest'. Install with install.packages('randomForest').") + stop("Function 'morie_random_forest_ensemble' requires package 'randomForest'. Install with install.packages('randomForest').") } if (is.null(dim(x))) x <- matrix(x, ncol = 1) x <- as.matrix(x) if (identical(task, "auto")) { - task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) - "classification" else "regression" + task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) { + "classification" + } else { + "regression" + } } y_use <- if (task == "classification") as.factor(y) else as.numeric(y) if (!is.null(deterministic_seed)) { diff --git a/r-package/morie/R/rfgen.R b/r-package/morie/R/rfgen.R index 4855c5ce38..3e6b2b4543 100644 --- a/r-package/morie/R/rfgen.R +++ b/r-package/morie/R/rfgen.R @@ -16,19 +16,23 @@ #' @return list(estimate, y_hat, oob_score, feature_importance, se, n, method). #' @references Breiman (2001); Montesinos Lopez Ch 8. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_random_forest_genomic( +#' x = rnorm(50), y = rnorm(50), +#' markers = matrix(sample(0:2, 200, TRUE), 50, 4) +#' ) #' @export -random_forest_genomic <- function(x, y, markers, n_trees = 100, - max_depth = 10, min_samples = 2, - mtry = NULL, seed = 0) { +morie_random_forest_genomic <- function(x, y, markers, n_trees = 100, + max_depth = 10, min_samples = 2, + mtry = NULL, seed = 0) { set.seed(seed) - y <- as.numeric(y); n <- length(y) + y <- as.numeric(y) + n <- length(y) M <- as.matrix(markers) - feats <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) M - else cbind(as.matrix(x), M) + feats <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) { + M + } else { + cbind(as.matrix(x), M) + } p <- ncol(feats) if (is.null(mtry)) mtry <- max(floor(sqrt(p)), 1L) if (requireNamespace("randomForest", quietly = TRUE)) { @@ -39,13 +43,14 @@ random_forest_genomic <- function(x, y, markers, n_trees = 100, y_hat <- as.numeric(stats::predict(rf, feats)) oob_pred <- rf$predicted oob <- as.numeric(1 - sum((y - oob_pred)^2) / - max(sum((y - mean(y))^2), 1e-12)) + max(sum((y - mean(y))^2), 1e-12)) imp <- as.numeric(randomForest::importance(rf)[, 1]) method_used <- "randomForest::randomForest" } else { method_used <- "base-R bagged regression-tree fallback" trees <- vector("list", n_trees) - oob_preds <- rep(0, n); oob_count <- rep(0, n) + oob_preds <- rep(0, n) + oob_count <- rep(0, n) for (b in seq_len(n_trees)) { boot <- sample.int(n, n, replace = TRUE) oob_mask <- !(seq_len(n) %in% boot) @@ -63,23 +68,29 @@ random_forest_genomic <- function(x, y, markers, n_trees = 100, oob_count[oob_count == 0] <- 1 oob_pred_avg <- oob_preds / oob_count oob <- as.numeric(1 - sum((y - oob_pred_avg)^2) / - max(sum((y - mean(y))^2), 1e-12)) - df_all <- data.frame(feats); names(df_all) <- names(tr_df)[-1] - yh_acc <- rep(0, n); cnt <- 0 - for (tr in trees) if (!is.null(tr)) { - yh_acc <- yh_acc + as.numeric(stats::predict(tr, df_all)) - cnt <- cnt + 1 + max(sum((y - mean(y))^2), 1e-12)) + df_all <- data.frame(feats) + names(df_all) <- names(tr_df)[-1] + yh_acc <- rep(0, n) + cnt <- 0 + for (tr in trees) { + if (!is.null(tr)) { + yh_acc <- yh_acc + as.numeric(stats::predict(tr, df_all)) + cnt <- cnt + 1 + } } y_hat <- yh_acc / max(cnt, 1) imp <- rep(NA_real_, p) } resid <- y - y_hat - list(estimate = mean(y_hat), y_hat = y_hat, oob_score = oob, - feature_importance = imp, se = sqrt(mean(resid^2)), - n = n, method = method_used) + list( + estimate = mean(y_hat), y_hat = y_hat, oob_score = oob, + feature_importance = imp, se = sqrt(mean(resid^2)), + n = n, method = method_used + ) } # CANONICAL TEST # set.seed(13); M <- matrix(rnorm(200), 40, 5) # y <- M[,1] + 0.5*M[,2]^2 + 0.2*rnorm(40) -# random_forest_genomic(rep(0,40), y, M, n_trees=20, seed=13) +# morie_random_forest_genomic(rep(0,40), y, M, n_trees=20, seed=13) diff --git a/r-package/morie/R/rgadp.R b/r-package/morie/R/rgadp.R index 33e60e8dd2..9118f4f038 100644 --- a/r-package/morie/R/rgadp.R +++ b/r-package/morie/R/rgadp.R @@ -11,24 +11,31 @@ #' @references Widrow & Stearns (1985); Rangayyan Ch 11. #' @export #' @examples -#' set.seed(0); n <- rnorm(200) +#' set.seed(0) +#' n <- rnorm(200) #' x <- sin(2 * pi * seq_len(200) / 20) + n -#' r <- rgadp(x, reference = n, mu = 0.01, order = 8); length(r$signal) +#' r <- rgadp(x, reference = n, mu = 0.01, order = 8) +#' length(r$signal) rgadp <- function(x, reference, mu = 0.01, order = 16L) { if (length(x) != length(reference)) stop("x and reference must have equal length.") - M <- as.integer(order); N <- length(x) - w <- numeric(M); y <- numeric(N); e <- numeric(N) + M <- as.integer(order) + N <- length(x) + w <- numeric(M) + y <- numeric(N) + e <- numeric(N) for (n in M:N) { rv <- reference[(n - M + 1):n][seq.int(M, 1L)] y[n] <- sum(w * rv) e[n] <- x[n] - y[n] w <- w + 2 * mu * e[n] * rv } - list(signal = e, noise_estimate = y, weights = w, - mu = mu, order = M) + list( + signal = e, noise_estimate = y, weights = w, + mu = mu, order = M + ) } #' @rdname rgadp #' @keywords internal #' @export -rangayyan_adaptive_filter <- rgadp +morie_rangayyan_adaptive_filter <- rgadp diff --git a/r-package/morie/R/rgapn.R b/r-package/morie/R/rgapn.R index a595ed9eb6..9e3ed3fb44 100644 --- a/r-package/morie/R/rgapn.R +++ b/r-package/morie/R/rgapn.R @@ -13,7 +13,8 @@ #' @references Pincus (1991), PNAS 88:2297. #' @export #' @examples -#' set.seed(0); rgapn(rnorm(100), m = 2)$ApEn +#' set.seed(0) +#' rgapn(rnorm(100), m = 2)$ApEn rgapn <- function(x, m = 2L, r = NULL) { N <- length(x) if (is.null(r)) r <- 0.2 * stats::sd(x) @@ -31,11 +32,12 @@ rgapn <- function(x, m = 2L, r = NULL) { C <- pmax(C, 1e-30) mean(log(C)) } - pm <- phi(m); pm1 <- phi(m + 1L) + pm <- phi(m) + pm1 <- phi(m + 1L) list(ApEn = pm - pm1, phi_m = pm, phi_m1 = pm1, m = m, r = r, n = N) } #' @rdname rgapn #' @keywords internal #' @export -rangayyan_approximate_entropy <- rgapn +morie_rangayyan_approximate_entropy <- rgapn diff --git a/r-package/morie/R/rgarb.R b/r-package/morie/R/rgarb.R index c43778df95..7c81daa2af 100644 --- a/r-package/morie/R/rgarb.R +++ b/r-package/morie/R/rgarb.R @@ -12,11 +12,15 @@ #' @references Burg (1975); Marple (1987); Rangayyan Ch 4. #' @export #' @examples -#' set.seed(0); r <- rgarb(rnorm(500), order = 4); length(r$ar_coeffs) +#' set.seed(0) +#' r <- rgarb(rnorm(500), order = 4) +#' length(r$ar_coeffs) rgarb <- function(x, order = 10L) { - N <- length(x); p <- as.integer(order) + N <- length(x) + p <- as.integer(order) if (p < 1 || p >= N) stop("order must be 1 <= p < length(x).") - f <- as.numeric(x); b <- as.numeric(x) + f <- as.numeric(x) + b <- as.numeric(x) a <- c(1, rep(0, p)) var_ <- mean(x^2) k <- numeric(p) @@ -42,4 +46,4 @@ rgarb <- function(x, order = 10L) { #' @rdname rgarb #' @keywords internal #' @export -rangayyan_ar_burg <- rgarb +morie_rangayyan_ar_burg <- rgarb diff --git a/r-package/morie/R/rgcoh.R b/r-package/morie/R/rgcoh.R index a61059cea2..3aae143bf8 100644 --- a/r-package/morie/R/rgcoh.R +++ b/r-package/morie/R/rgcoh.R @@ -1,6 +1,6 @@ -#' Magnitude-squared coherence -- Rangayyan Ch 4 +#' Magnitude-squared morie_coherence -- Rangayyan Ch 4 #' -#' Welch-averaged magnitude-squared coherence:: +#' Welch-averaged magnitude-squared morie_coherence:: #' #' \deqn{C_{xy}(f) = |S_{xy}(f)|^2 / (S_{xx}(f) S_{yy}(f))} #' @@ -9,15 +9,18 @@ #' @param x,y Numeric vectors of equal length. #' @param fs Sampling rate (Hz). #' @param nperseg Welch segment length (default `min(N, 256)`). -#' @return Named list `freqs`, `coherence`, `mean_coherence`, +#' @return Named list `freqs`, `morie_coherence`, `mean_coherence`, #' `peak_freq`, `peak_coherence`. #' @references Rangayyan Ch 4. #' @export #' @examples #' \donttest{ -#' set.seed(0); fs <- 100; t <- seq(0, 10, length.out = 1024) -#' a <- sin(2 * pi * 10 * t); b <- a + 0.1 * rnorm(length(t)) -#' rgcoh(a, b, fs = fs)$peak_coherence > 0.5 +#' set.seed(0) +#' fs <- 100 +#' t <- seq(0, 10, length.out = 1024) +#' a <- sin(2 * pi * 10 * t) +#' b <- a + 0.1 * rnorm(length(t)) +#' rgcoh(a, b, fs = fs)$peak_coherence > 0.5 #' } rgcoh <- function(x, y, fs = 1.0, nperseg = NULL) { N <- length(x) @@ -31,7 +34,8 @@ rgcoh <- function(x, y, fs = 1.0, nperseg = NULL) { W <- sum(w^2) nf <- nperseg %/% 2L + 1L freqs <- seq(0, fs / 2, length.out = nf) - Sxx <- Syy <- numeric(nf); Sxy <- complex(nf) + Sxx <- Syy <- numeric(nf) + Sxy <- complex(nf) for (s in starts) { sx <- (x[s:(s + nperseg - 1)] - mean(x[s:(s + nperseg - 1)])) * w sy <- (y[s:(s + nperseg - 1)] - mean(y[s:(s + nperseg - 1)])) * w @@ -43,13 +47,15 @@ rgcoh <- function(x, y, fs = 1.0, nperseg = NULL) { } Cxy <- Mod(Sxy)^2 / (Sxx * Syy) peak <- which.max(Cxy) - list(freqs = freqs, coherence = Cxy, - mean_coherence = mean(Cxy), - peak_freq = freqs[peak], - peak_coherence = Cxy[peak]) + list( + freqs = freqs, morie_coherence = Cxy, + mean_coherence = mean(Cxy), + peak_freq = freqs[peak], + peak_coherence = Cxy[peak] + ) } #' @rdname rgcoh #' @keywords internal #' @export -rangayyan_coherence <- rgcoh +morie_rangayyan_coherence <- rgcoh diff --git a/r-package/morie/R/rgcrl.R b/r-package/morie/R/rgcrl.R index 1c328d811d..b107c23d22 100644 --- a/r-package/morie/R/rgcrl.R +++ b/r-package/morie/R/rgcrl.R @@ -10,7 +10,8 @@ #' @references Grassberger & Procaccia (1983), Physica D 9:189. #' @export #' @examples -#' set.seed(0); rgcrl(rnorm(200), m = 3, tau = 1, n_r = 15)$D2 +#' set.seed(0) +#' rgcrl(rnorm(200), m = 3, tau = 1, n_r = 15)$D2 rgcrl <- function(x, m = 3L, tau = 1L, n_r = 20L) { N <- length(x) M <- N - (m - 1L) * tau @@ -25,11 +26,12 @@ rgcrl <- function(x, m = 3L, tau = 1L, n_r = 20L) { rs <- 10^seq(log10(rmin), log10(rmax), length.out = n_r) C <- vapply(rs, function(r) mean(dist <= r), numeric(1)) mask <- C > 0 & is.finite(C) - log_r <- log(rs[mask]); log_C <- log(C[mask]) + log_r <- log(rs[mask]) + log_C <- log(C[mask]) if (length(log_r) < 3) { D2 <- NA_real_ } else { - n <- length(log_r) + n <- length(log_r) lo <- max(1L, n %/% 5L) hi <- max(lo + 2L, n - n %/% 5L) D2 <- unname(stats::coef(stats::lm(log_C[lo:hi] ~ log_r[lo:hi]))[2]) @@ -40,4 +42,4 @@ rgcrl <- function(x, m = 3L, tau = 1L, n_r = 20L) { #' @rdname rgcrl #' @keywords internal #' @export -rangayyan_correlation_dimension <- rgcrl +morie_rangayyan_correlation_dimension <- rgcrl diff --git a/r-package/morie/R/rgdfa.R b/r-package/morie/R/rgdfa.R index e339c51054..40cdf8fe88 100644 --- a/r-package/morie/R/rgdfa.R +++ b/r-package/morie/R/rgdfa.R @@ -5,11 +5,12 @@ #' @param x Numeric vector. #' @param scales Optional integer box sizes (default geometric 4..N/4). #' @param order Detrending polynomial order (default 1 = DFA-1). -#' @return Named list `alpha`, `scales`, `F`, `log_scales`, `log_F`. +#' @return Named list `alpha`, `scales`, `fluct`, `log_scales`, `log_F`. #' @references Peng et al. (1994), Phys Rev E 49:1685; Rangayyan Ch 7. #' @export #' @examples -#' set.seed(0); rgdfa(rnorm(500))$alpha +#' set.seed(0) +#' rgdfa(rnorm(500))$alpha rgdfa <- function(x, scales = NULL, order = 1L) { N <- length(x) if (N < 32) stop("DFA needs at least 32 samples.") @@ -20,11 +21,14 @@ rgdfa <- function(x, scales = NULL, order = 1L) { } scales <- as.integer(scales) y <- cumsum(x - mean(x)) - F <- numeric(length(scales)) + fluct <- numeric(length(scales)) for (j in seq_along(scales)) { n <- scales[j] nseg <- N %/% n - if (nseg < 1) { F[j] <- NA_real_; next } + if (nseg < 1) { + fluct[j] <- NA_real_ + next + } rms <- numeric(nseg) for (k in seq_len(nseg)) { seg <- y[((k - 1) * n + 1):(k * n)] @@ -32,17 +36,20 @@ rgdfa <- function(x, scales = NULL, order = 1L) { p <- stats::lm(seg ~ stats::poly(t_, order, raw = TRUE)) rms[k] <- mean(stats::residuals(p)^2) } - F[j] <- sqrt(mean(rms)) + fluct[j] <- sqrt(mean(rms)) } - mask <- is.finite(F) & F > 0 - log_n <- log(scales[mask]); log_F <- log(F[mask]) + mask <- is.finite(fluct) & fluct > 0 + log_n <- log(scales[mask]) + log_F <- log(fluct[mask]) fit <- stats::lm(log_F ~ log_n) - list(alpha = unname(stats::coef(fit)[2]), - scales = scales, F = F, - log_scales = log_n, log_F = log_F) + list( + alpha = unname(stats::coef(fit)[2]), + scales = scales, fluct = fluct, + log_scales = log_n, log_F = log_F + ) } #' @rdname rgdfa #' @keywords internal #' @export -rangayyan_dfa <- rgdfa +morie_rangayyan_dfa <- rgdfa diff --git a/r-package/morie/R/rgeeg.R b/r-package/morie/R/rgeeg.R index aa623882ce..b1c5597384 100644 --- a/r-package/morie/R/rgeeg.R +++ b/r-package/morie/R/rgeeg.R @@ -16,20 +16,25 @@ #' @export #' @examples #' \donttest{ -#' set.seed(0); fs <- 256 -#' t <- seq(0, 8, length.out = 2048) -#' x <- sin(2 * pi * 10 * t) + 0.3 * rnorm(length(t)) -#' r <- rgeeg(x, fs = fs); r$relative[["alpha"]] > r$relative[["gamma"]] +#' set.seed(0) +#' fs <- 256 +#' t <- seq(0, 8, length.out = 2048) +#' x <- sin(2 * pi * 10 * t) + 0.3 * rnorm(length(t)) +#' r <- rgeeg(x, fs = fs) +#' r$relative[["alpha"]] > r$relative[["gamma"]] #' } rgeeg <- function(x, fs, bands = NULL, nperseg = NULL) { if (is.null(bands)) { - bands <- list(delta = c(0.5, 4), theta = c(4, 8), - alpha = c(8, 13), beta = c(13, 30), - gamma = c(30, 100)) + bands <- list( + delta = c(0.5, 4), theta = c(4, 8), + alpha = c(8, 13), beta = c(13, 30), + gamma = c(30, 100) + ) } if (is.null(nperseg)) nperseg <- max(16, min(length(x), as.integer(4 * fs))) ps <- rgpsd(x, fs = fs, nperseg = nperseg) - freqs <- ps$freqs; psd <- ps$psd + freqs <- ps$freqs + psd <- ps$psd df <- diff(freqs[1:2]) total <- sum(psd) * df absolute <- vapply(names(bands), function(nm) { @@ -39,11 +44,13 @@ rgeeg <- function(x, fs, bands = NULL, nperseg = NULL) { }, numeric(1)) names(absolute) <- names(bands) relative <- if (total > 0) absolute / total else absolute * 0 - list(absolute = absolute, relative = relative, - total_power = total, freqs = freqs, psd = psd) + list( + absolute = absolute, relative = relative, + total_power = total, freqs = freqs, psd = psd + ) } #' @rdname rgeeg #' @keywords internal #' @export -rangayyan_eeg_bands <- rgeeg +morie_rangayyan_eeg_bands <- rgeeg diff --git a/r-package/morie/R/rgemg.R b/r-package/morie/R/rgemg.R index 3761e4440d..1aef91a42d 100644 --- a/r-package/morie/R/rgemg.R +++ b/r-package/morie/R/rgemg.R @@ -9,7 +9,9 @@ #' @references Rangayyan Ch 8. #' @export #' @examples -#' set.seed(0); r <- rgemg(rnorm(500), window = 32); length(r$rms) +#' set.seed(0) +#' r <- rgemg(rnorm(500), window = 32) +#' length(r$rms) rgemg <- function(x, window = 64L, fs = 1.0) { W <- as.integer(window) if (W < 1) stop("window must be >= 1") @@ -25,4 +27,4 @@ rgemg <- function(x, window = 64L, fs = 1.0) { #' @rdname rgemg #' @keywords internal #' @export -rangayyan_emg_rms <- rgemg +morie_rangayyan_emg_rms <- rgemg diff --git a/r-package/morie/R/rgenv.R b/r-package/morie/R/rgenv.R index e5fdd88142..a1b025b60a 100644 --- a/r-package/morie/R/rgenv.R +++ b/r-package/morie/R/rgenv.R @@ -15,13 +15,15 @@ #' @examples #' t <- seq(0, 1, length.out = 200) #' x <- cos(2 * pi * 5 * t) * (1 + 0.3 * cos(2 * pi * 0.5 * t)) -#' r <- rgenv(x); length(r$envelope) +#' r <- rgenv(x) +#' length(r$envelope) rgenv <- function(x) { N <- length(x) X <- stats::fft(x) h <- numeric(N) if (N %% 2 == 0) { - h[1] <- 1; h[N / 2 + 1] <- 1 + h[1] <- 1 + h[N / 2 + 1] <- 1 h[2:(N / 2)] <- 2 } else { h[1] <- 1 @@ -35,12 +37,14 @@ rgenv <- function(x) { dphi <- ((dphi + pi) %% (2 * pi)) - pi phase_unwrapped <- cumsum(c(phase[1], dphi)) inst_freq <- diff(phase_unwrapped) / (2 * pi) - list(envelope = env, analytic = z, - instantaneous_phase = phase_unwrapped, - instantaneous_freq = inst_freq) + list( + envelope = env, analytic = z, + instantaneous_phase = phase_unwrapped, + instantaneous_freq = inst_freq + ) } #' @rdname rgenv #' @keywords internal #' @export -rangayyan_envelope <- rgenv +morie_rangayyan_envelope <- rgenv diff --git a/r-package/morie/R/rgfir.R b/r-package/morie/R/rgfir.R index 0587bfcca7..78a69972f6 100644 --- a/r-package/morie/R/rgfir.R +++ b/r-package/morie/R/rgfir.R @@ -29,23 +29,26 @@ rgfir <- function(x, cutoff, order = 51L, fs = 1.0, window = "hamming") { if (order < 3L) order <- 3L if (order %% 2L == 0L) order <- order + 1L nyq <- 0.5 * fs - fc <- min(max(cutoff / nyq, 1e-6), 1 - 1e-6) + fc <- min(max(cutoff / nyq, 1e-6), 1 - 1e-6) if (requireNamespace("signal", quietly = TRUE)) { win_fn <- switch(window, hamming = signal::hamming(order), hann = signal::hanning(order), blackman = signal::blackman(order), rectangular = rep(1, order), - signal::hamming(order)) + signal::hamming(order) + ) taps <- signal::fir1(order - 1L, fc, type = "low", window = win_fn) padlen <- 3L * order if (length(x) > padlen) { - y <- as.numeric(signal::filtfilt(taps, 1, x)) + y <- as.numeric(signal::filtfilt(taps, x)) } else { - y <- as.numeric(signal::filter(taps, 1, x)) + y <- as.numeric(signal::filter(taps, x)) } - return(list(signal = y, taps = taps, order = order, - cutoff = cutoff, fs = fs, window = window)) + return(list( + signal = y, taps = taps, order = order, + cutoff = cutoff, fs = fs, window = window + )) } stop("R package 'signal' is required for rgfir().") } @@ -53,4 +56,4 @@ rgfir <- function(x, cutoff, order = 51L, fs = 1.0, window = "hamming") { #' @rdname rgfir #' @keywords internal #' @export -rangayyan_fir_filter <- rgfir +morie_rangayyan_fir_filter <- rgfir diff --git a/r-package/morie/R/rghfd.R b/r-package/morie/R/rghfd.R index 1c2606a9de..2b8213fc4a 100644 --- a/r-package/morie/R/rghfd.R +++ b/r-package/morie/R/rghfd.R @@ -8,7 +8,8 @@ #' @references Higuchi (1988), Physica D 31:277. Rangayyan Ch 7. #' @export #' @examples -#' set.seed(0); rghfd(rnorm(500), kmax = 8)$HFD +#' set.seed(0) +#' rghfd(rnorm(500), kmax = 8)$HFD rghfd <- function(x, kmax = 10L) { N <- length(x) if (N < 4 || kmax < 2) stop("Need length(x) >= 4 and kmax >= 2.") @@ -20,20 +21,23 @@ rghfd <- function(x, kmax = 10L) { idx <- seq(m + 1L, N, by = k) if (length(idx) < 2) next diffs <- sum(abs(diff(x[idx]))) - norm <- (N - 1) / (k * floor((N - m) / k)) + norm <- (N - 1) / (k * floor((N - m) / k)) lk <- c(lk, (diffs / k) * norm) } L[k] <- if (length(lk)) mean(lk) else NA_real_ } ks <- seq_len(kmax) - log_L <- log(L); log_inv_k <- log(1 / ks) + log_L <- log(L) + log_inv_k <- log(1 / ks) fit <- stats::lm(log_L ~ log_inv_k) - list(HFD = unname(stats::coef(fit)[2]), - intercept = unname(stats::coef(fit)[1]), - log_L = log_L, log_inv_k = log_inv_k, kmax = kmax) + list( + HFD = unname(stats::coef(fit)[2]), + intercept = unname(stats::coef(fit)[1]), + log_L = log_L, log_inv_k = log_inv_k, kmax = kmax + ) } #' @rdname rghfd #' @keywords internal #' @export -rangayyan_higuchi_fd <- rghfd +morie_rangayyan_higuchi_fd <- rghfd diff --git a/r-package/morie/R/rghrv.R b/r-package/morie/R/rghrv.R index 5d6a2afa7e..fed8a40327 100644 --- a/r-package/morie/R/rghrv.R +++ b/r-package/morie/R/rghrv.R @@ -16,7 +16,9 @@ #' @references Task Force (1996), Circulation 93:1043. Rangayyan Ch 6. #' @export #' @examples -#' set.seed(0); rgh <- rghrv(800 + rnorm(200, sd = 40)); rgh$heart_rate_bpm +#' set.seed(0) +#' rgh <- rghrv(800 + rnorm(200, sd = 40)) +#' rgh$heart_rate_bpm rghrv <- function(rr_ms) { rr <- as.numeric(rr_ms) n <- length(rr) @@ -27,11 +29,13 @@ rghrv <- function(rr_ms) { rmssd <- sqrt(mean(d^2)) pnn50 <- 100 * mean(abs(d) > 50) hr <- if (mean_nn > 0) 60000 / mean_nn else NA_real_ - list(meanNN = mean_nn, SDNN = sdnn, RMSSD = rmssd, pNN50 = pnn50, - heart_rate_bpm = hr, n = n) + list( + meanNN = mean_nn, SDNN = sdnn, RMSSD = rmssd, pNN50 = pnn50, + heart_rate_bpm = hr, n = n + ) } #' @rdname rghrv #' @keywords internal #' @export -rangayyan_hrv <- rghrv +morie_rangayyan_hrv <- rghrv diff --git a/r-package/morie/R/rgiir.R b/r-package/morie/R/rgiir.R index 997ebe3b22..44dcddecfb 100644 --- a/r-package/morie/R/rgiir.R +++ b/r-package/morie/R/rgiir.R @@ -29,14 +29,16 @@ rgiir <- function(x, cutoff, order = 4L, fs = 1.0, btype = c("low", "high", "pas stop("R package 'signal' is required for rgiir().") } nyq <- 0.5 * fs - wn <- cutoff / nyq + wn <- cutoff / nyq bf <- signal::butter(as.integer(order), wn, type = btype) - y <- as.numeric(signal::filtfilt(bf, x)) - list(signal = y, order = as.integer(order), cutoff = cutoff, - fs = fs, btype = btype) + y <- as.numeric(signal::filtfilt(bf, x)) + list( + signal = y, order = as.integer(order), cutoff = cutoff, + fs = fs, btype = btype + ) } #' @rdname rgiir #' @keywords internal #' @export -rangayyan_iir_filter <- rgiir +morie_rangayyan_iir_filter <- rgiir diff --git a/r-package/morie/R/rglyp.R b/r-package/morie/R/rglyp.R index aa52daf4eb..f49740feb8 100644 --- a/r-package/morie/R/rglyp.R +++ b/r-package/morie/R/rglyp.R @@ -12,7 +12,8 @@ #' @references Rosenstein et al. (1993), Physica D 65:117. #' @export #' @examples -#' set.seed(0); rglyp(rnorm(200), m = 3, tau = 1, max_t = 20)$lyapunov +#' set.seed(0) +#' rglyp(rnorm(200), m = 3, tau = 1, max_t = 20)$lyapunov rglyp <- function(x, m = 3L, tau = 1L, max_t = NULL, theiler = 10L) { N <- length(x) M <- N - (m - 1L) * tau @@ -31,7 +32,7 @@ rglyp <- function(x, m = 3L, tau = 1L, max_t = NULL, theiler = 10L) { ok <- (iv + t0 <= M) & (nn + t0 <= M) if (!any(ok)) next diffs <- sqrt(rowSums((Y[iv[ok] + t0, , drop = FALSE] - - Y[nn[ok] + t0, , drop = FALSE])^2)) + Y[nn[ok] + t0, , drop = FALSE])^2)) diffs <- diffs[diffs > 0] if (length(diffs)) div[t] <- mean(log(diffs)) } @@ -42,11 +43,13 @@ rglyp <- function(x, m = 3L, tau = 1L, max_t = NULL, theiler = 10L) { half <- max(3L, length(ts) %/% 2L) lam <- stats::coef(stats::lm(div[ts[seq_len(half)]] ~ ts[seq_len(half)]))[2] } - list(lyapunov = unname(lam), divergence_curve = div, - t = seq_len(max_t)) + list( + lyapunov = unname(lam), divergence_curve = div, + t = seq_len(max_t) + ) } #' @rdname rglyp #' @keywords internal #' @export -rangayyan_lyapunov <- rglyp +morie_rangayyan_lyapunov <- rglyp diff --git a/r-package/morie/R/rgpsd.R b/r-package/morie/R/rgpsd.R index 9df46bdc4e..bd10909632 100644 --- a/r-package/morie/R/rgpsd.R +++ b/r-package/morie/R/rgpsd.R @@ -14,7 +14,8 @@ #' @export #' @examples #' \donttest{ -#' set.seed(0); fs <- 100 +#' set.seed(0) +#' fs <- 100 #' t <- seq(0, 10, length.out = 1000) #' x <- sin(2 * pi * 10 * t) #' r <- rgpsd(x, fs = fs, nperseg = 256) @@ -30,17 +31,18 @@ rgpsd <- function(x, fs = 1.0, nperseg = NULL, window = "hann") { starts <- seq(1, N - nperseg + 1, by = step) if (length(starts) < 1) starts <- 1 w <- switch(window, - hann = 0.5 - 0.5 * cos(2 * pi * (seq_len(nperseg) - 1) / (nperseg - 1)), - hamming = 0.54 - 0.46 * cos(2 * pi * (seq_len(nperseg) - 1) / (nperseg - 1)), - boxcar = rep(1, nperseg), - rep(1, nperseg)) + hann = 0.5 - 0.5 * cos(2 * pi * (seq_len(nperseg) - 1) / (nperseg - 1)), + hamming = 0.54 - 0.46 * cos(2 * pi * (seq_len(nperseg) - 1) / (nperseg - 1)), + boxcar = rep(1, nperseg), + rep(1, nperseg) + ) W <- sum(w^2) freqs <- seq(0, fs / 2, length.out = nperseg %/% 2L + 1L) psd_acc <- numeric(length(freqs)) for (s in starts) { seg <- x[s:(s + nperseg - 1)] - mean(x[s:(s + nperseg - 1)]) seg <- seg * w - X <- stats::fft(seg)[1:length(freqs)] + X <- stats::fft(seg)[seq_along(freqs)] pxx <- (Mod(X)^2) / (fs * W) # one-sided scaling: double interior bins if (length(pxx) > 2) pxx[2:(length(pxx) - 1)] <- 2 * pxx[2:(length(pxx) - 1)] @@ -49,11 +51,13 @@ rgpsd <- function(x, fs = 1.0, nperseg = NULL, window = "hann") { psd <- psd_acc / length(starts) peak <- which.max(psd) total <- sum(psd) * (freqs[2] - freqs[1]) - list(freqs = freqs, psd = psd, fs = fs, nperseg = nperseg, - peak_freq = freqs[peak], total_power = total) + list( + freqs = freqs, psd = psd, fs = fs, nperseg = nperseg, + peak_freq = freqs[peak], total_power = total + ) } #' @rdname rgpsd #' @keywords internal #' @export -rangayyan_psd <- rgpsd +morie_rangayyan_psd <- rgpsd diff --git a/r-package/morie/R/rgqrs.R b/r-package/morie/R/rgqrs.R index ba5bd8597a..e77a98f074 100644 --- a/r-package/morie/R/rgqrs.R +++ b/r-package/morie/R/rgqrs.R @@ -12,12 +12,15 @@ #' @export #' @examples #' \donttest{ -#' if (requireNamespace("signal", quietly = TRUE)) { -#' fs <- 360; t <- seq(0, 5, length.out = 5 * fs) -#' ecg <- rowSums(sapply(seq(0.5, 4.5, by = 1.0), -#' function(tk) exp(-((t - tk) * 30)^2))) -#' rgqrs(ecg, fs = fs)$r_peaks -#' } +#' if (requireNamespace("signal", quietly = TRUE)) { +#' fs <- 360 +#' t <- seq(0, 5, length.out = 5 * fs) +#' ecg <- rowSums(sapply( +#' seq(0.5, 4.5, by = 1.0), +#' function(tk) exp(-((t - tk) * 30)^2) +#' )) +#' rgqrs(ecg, fs = fs)$r_peaks +#' } #' } rgqrs <- function(x, fs = 360.0) { if (!requireNamespace("signal", quietly = TRUE)) { @@ -52,16 +55,19 @@ rgqrs <- function(x, fs = 360.0) { # refine each to local |bp| max within +/-50 ms half <- as.integer(round(0.05 * fs)) refined <- vapply(peaks, function(p) { - lo <- max(1L, p - half); hi <- min(N, p + half) + lo <- max(1L, p - half) + hi <- min(N, p + half) lo + which.max(abs(bp[lo:hi])) - 1L }, integer(1)) rr_ms <- if (length(refined) > 1) diff(refined) * 1000 / fs else numeric(0) hr <- if (length(rr_ms)) 60000 / mean(rr_ms) else NA_real_ - list(r_peaks = refined, rr_intervals_ms = rr_ms, - heart_rate_bpm = hr, integrated = integ, fs = fs) + list( + r_peaks = refined, rr_intervals_ms = rr_ms, + heart_rate_bpm = hr, integrated = integ, fs = fs + ) } #' @rdname rgqrs #' @keywords internal #' @export -rangayyan_qrs_detect <- rgqrs +morie_rangayyan_qrs_detect <- rgqrs diff --git a/r-package/morie/R/rgsam.R b/r-package/morie/R/rgsam.R index cbb7ec7d8a..ed8a25fe97 100644 --- a/r-package/morie/R/rgsam.R +++ b/r-package/morie/R/rgsam.R @@ -15,7 +15,8 @@ #' @references Richman & Moorman (2000), AJP Heart 278:H2039. #' @export #' @examples -#' set.seed(0); rgsam(rnorm(100), m = 2)$SampEn +#' set.seed(0) +#' rgsam(rnorm(100), m = 2)$SampEn rgsam <- function(x, m = 2L, r = NULL) { N <- length(x) if (is.null(r)) r <- 0.2 * stats::sd(x) @@ -32,7 +33,8 @@ rgsam <- function(x, m = 2L, r = NULL) { } cnt } - B <- matches(m); A <- matches(m + 1L) + B <- matches(m) + A <- matches(m + 1L) sampen <- if (A == 0 || B == 0) Inf else -log(A / B) list(SampEn = sampen, A = A, B = B, m = m, r = r, n = N) } @@ -40,4 +42,4 @@ rgsam <- function(x, m = 2L, r = NULL) { #' @rdname rgsam #' @keywords internal #' @export -rangayyan_sample_entropy <- rgsam +morie_rangayyan_sample_entropy <- rgsam diff --git a/r-package/morie/R/rgstf.R b/r-package/morie/R/rgstf.R index b44bec1862..595358c0fe 100644 --- a/r-package/morie/R/rgstf.R +++ b/r-package/morie/R/rgstf.R @@ -14,8 +14,10 @@ #' @export #' @examples #' \donttest{ -#' t <- seq(0, 10, length.out = 1024); x <- sin(2 * pi * 10 * t) -#' r <- rgstf(x, fs = 100, nperseg = 128); dim(r$Sxx) +#' t <- seq(0, 10, length.out = 1024) +#' x <- sin(2 * pi * 10 * t) +#' r <- rgstf(x, fs = 100, nperseg = 128) +#' dim(r$Sxx) #' } rgstf <- function(x, fs = 1.0, nperseg = 256L, noverlap = NULL, window = "hann") { @@ -25,10 +27,11 @@ rgstf <- function(x, fs = 1.0, nperseg = 256L, noverlap = NULL, starts <- seq(1, length(x) - nperseg + 1, by = step) if (length(starts) < 1) starts <- 1 w <- switch(window, - hann = 0.5 - 0.5 * cos(2 * pi * (seq_len(nperseg) - 1) / (nperseg - 1)), - hamming = 0.54 - 0.46 * cos(2 * pi * (seq_len(nperseg) - 1) / (nperseg - 1)), - boxcar = rep(1, nperseg), - rep(1, nperseg)) + hann = 0.5 - 0.5 * cos(2 * pi * (seq_len(nperseg) - 1) / (nperseg - 1)), + hamming = 0.54 - 0.46 * cos(2 * pi * (seq_len(nperseg) - 1) / (nperseg - 1)), + boxcar = rep(1, nperseg), + rep(1, nperseg) + ) W <- sum(w^2) nfreq <- nperseg %/% 2L + 1L freqs <- seq(0, fs / 2, length.out = nfreq) @@ -43,11 +46,13 @@ rgstf <- function(x, fs = 1.0, nperseg = 256L, noverlap = NULL, Sxx[, i] <- pxx times[i] <- (s + nperseg / 2 - 1) / fs } - list(freqs = freqs, times = times, Sxx = Sxx, - nperseg = nperseg, noverlap = as.integer(noverlap), fs = fs) + list( + freqs = freqs, times = times, Sxx = Sxx, + nperseg = nperseg, noverlap = as.integer(noverlap), fs = fs + ) } #' @rdname rgstf #' @keywords internal #' @export -rangayyan_stft <- rgstf +morie_rangayyan_stft <- rgstf diff --git a/r-package/morie/R/rgwav.R b/r-package/morie/R/rgwav.R index 80a62a7b81..6665f1435a 100644 --- a/r-package/morie/R/rgwav.R +++ b/r-package/morie/R/rgwav.R @@ -2,7 +2,7 @@ #' #' Donoho-Johnstone soft/hard thresholding via the `wavelets` package #' (Daubechies DWT). Noise scale estimated from the finest-scale detail -#' as sigma = MAD / 0.6745; universal threshold T = sigma sqrt(2 ln N). +#' as sigma = MAD / 0.6745; universal threshold thr = sigma sqrt(2 ln N). #' #' Falls back to a 5-tap moving-average smoother (with a warning) if #' `wavelets` is not installed -- keeps R<->Py parity to ~5 % on smooth @@ -18,11 +18,13 @@ #' @export #' @examples #' \donttest{ -#' set.seed(0); t <- seq(0, 1, length.out = 256) -#' x <- sin(2 * pi * 3 * t) + 0.3 * rnorm(256) -#' if (requireNamespace("wavelets", quietly = TRUE)) { -#' r <- rgwav(x, level = 3); length(r$signal) -#' } +#' set.seed(0) +#' t <- seq(0, 1, length.out = 256) +#' x <- sin(2 * pi * 3 * t) + 0.3 * rnorm(256) +#' if (requireNamespace("wavelets", quietly = TRUE)) { +#' r <- rgwav(x, level = 3) +#' length(r$signal) +#' } #' } rgwav <- function(x, wavelet = "d8", level = NULL, mode = c("soft", "hard")) { mode <- match.arg(mode) @@ -31,12 +33,14 @@ rgwav <- function(x, wavelet = "d8", level = NULL, mode = c("soft", "hard")) { k <- rep(1 / 5, 5) y <- as.numeric(stats::filter(x, k, sides = 2)) y[is.na(y)] <- x[is.na(y)] - return(list(signal = y, threshold = NA_real_, sigma = NA_real_, - wavelet = wavelet, level = 0L, mode = "MA-fallback")) + return(list( + signal = y, threshold = NA_real_, sigma = NA_real_, + wavelet = wavelet, level = 0L, mode = "MA-fallback" + )) } # Pad to next power of two (wavelets::dwt requires it) N0 <- length(x) - N <- 2^ceiling(log2(N0)) + N <- 2^ceiling(log2(N0)) xp <- c(x, rep(0, N - N0)) if (is.null(level)) { level <- min(4L, as.integer(log2(N)) - 1L) @@ -46,20 +50,25 @@ rgwav <- function(x, wavelet = "d8", level = NULL, mode = c("soft", "hard")) { # MAD on the finest detail coefficients (W[[1]]) d1 <- as.numeric(fit@W[[1]]) sigma <- stats::median(abs(d1)) / 0.6745 - T <- sigma * sqrt(2 * log(N)) - thresh <- function(d, T, mode) { - if (mode == "soft") sign(d) * pmax(abs(d) - T, 0) - else d * (abs(d) > T) + thr <- sigma * sqrt(2 * log(N)) + thresh <- function(d, thr, mode) { + if (mode == "soft") { + sign(d) * pmax(abs(d) - thr, 0) + } else { + d * (abs(d) > thr) + } } for (i in seq_along(fit@W)) { - fit@W[[i]][, 1] <- thresh(as.numeric(fit@W[[i]]), T, mode) + fit@W[[i]][, 1] <- thresh(as.numeric(fit@W[[i]]), thr, mode) } y <- as.numeric(wavelets::idwt(fit))[seq_len(N0)] - list(signal = y, threshold = T, sigma = sigma, - wavelet = wavelet, level = level, mode = mode) + list( + signal = y, threshold = thr, sigma = sigma, + wavelet = wavelet, level = level, mode = mode + ) } #' @rdname rgwav #' @keywords internal #' @export -rangayyan_wavelet_denoise <- rgwav +morie_rangayyan_wavelet_denoise <- rgwav diff --git a/r-package/morie/R/rgzcr.R b/r-package/morie/R/rgzcr.R index 32deedad81..3fd89d3e68 100644 --- a/r-package/morie/R/rgzcr.R +++ b/r-package/morie/R/rgzcr.R @@ -14,17 +14,22 @@ rgzcr <- function(x, fs = 1.0) { n <- length(x) if (n < 2) { - return(list(zcr = NA_real_, zcr_per_second = NA_real_, - crossings = 0L, n = n)) + return(list( + zcr = NA_real_, zcr_per_second = NA_real_, + crossings = 0L, n = n + )) } - s <- sign(x); s[s == 0] <- 1 + s <- sign(x) + s[s == 0] <- 1 crossings <- sum(abs(diff(s)) > 0) zcr <- crossings / (n - 1) - list(zcr = zcr, zcr_per_second = zcr * fs, - crossings = as.integer(crossings), n = n) + list( + zcr = zcr, zcr_per_second = zcr * fs, + crossings = as.integer(crossings), n = n + ) } #' @rdname rgzcr #' @keywords internal #' @export -rangayyan_zero_crossing <- rgzcr +morie_rangayyan_zero_crossing <- rgzcr diff --git a/r-package/morie/R/rgztn.R b/r-package/morie/R/rgztn.R index 3549f06d12..961d81cb51 100644 --- a/r-package/morie/R/rgztn.R +++ b/r-package/morie/R/rgztn.R @@ -13,32 +13,36 @@ #' @return Named list: estimate, coef_path, alphas, penalty, l1_ratio, #' n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -regularization_path <- function(x, y, penalty = c("ridge", "lasso", "elasticnet"), +morie_regularization_path <- function(x, y, penalty = c("ridge", "lasso", "elasticnet"), alphas = NULL, l1_ratio = 0.5) { + x <- .morie_ensure_design_matrix(x) if (!requireNamespace("glmnet", quietly = TRUE)) { - stop("Function 'regularization_path' requires package 'glmnet'. Install with install.packages('glmnet').") + stop("Function 'morie_regularization_path' requires package 'glmnet'. Install with install.packages('glmnet').") } penalty <- match.arg(penalty) if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); y <- as.numeric(y) - n <- nrow(x); p <- ncol(x) + x <- as.matrix(x) + y <- as.numeric(y) + n <- nrow(x) + p <- ncol(x) if (is.null(alphas)) alphas <- 10^seq(-3, 2, length.out = 50) gn_alpha <- switch(penalty, - ridge = 0, - lasso = 1, - elasticnet = l1_ratio) - fit <- glmnet::glmnet(x, y, alpha = gn_alpha, lambda = sort(alphas, decreasing = TRUE), - standardize = FALSE, intercept = TRUE) + ridge = 0, + lasso = 1, + elasticnet = l1_ratio + ) + fit <- glmnet::glmnet(x, y, + alpha = gn_alpha, lambda = sort(alphas, decreasing = TRUE), + standardize = FALSE, intercept = TRUE + ) # glmnet returns columns in decreasing-lambda order; re-order to match alphas ord <- order(fit$lambda) lam <- fit$lambda[ord] beta <- as.matrix(fit$beta)[, ord, drop = FALSE] - a0 <- fit$a0[ord] + a0 <- fit$a0[ord] coef_path <- rbind(a0, beta) coef_path <- t(coef_path) colnames(coef_path) <- c("(intercept)", colnames(x) %||% paste0("x", seq_len(p) - 1L)) diff --git a/r-package/morie/R/rkhsc.R b/r-package/morie/R/rkhsc.R index 2f3ac13d18..6d7e26c15b 100644 --- a/r-package/morie/R/rkhsc.R +++ b/r-package/morie/R/rkhsc.R @@ -12,9 +12,11 @@ #' @keywords internal rkhsc <- function(x, y, sigma = NULL, lam = 1e-3) { if (!is.matrix(x)) x <- matrix(x, ncol = 1) - y <- as.numeric(y); n <- nrow(x) - if (n < 2L || length(y) != n) + y <- as.numeric(y) + n <- nrow(x) + if (n < 2L || length(y) != n) { return(list(estimate = NA_real_, n = n, method = "RKHS KRR (n<2)")) + } D2 <- as.matrix(stats::dist(x))^2 if (is.null(sigma)) { med <- stats::median(sqrt(D2[D2 > 0])) @@ -24,15 +26,18 @@ rkhsc <- function(x, y, sigma = NULL, lam = 1e-3) { alpha <- solve(K + n * lam * diag(n), y) fitted <- as.numeric(K %*% alpha) resid <- y - fitted - sse <- sum(resid^2); sst <- sum((y - mean(y))^2) + sse <- sum(resid^2) + sst <- sum((y - mean(y))^2) r2 <- if (sst > 0) 1 - sse / sst else NA_real_ - list(alpha = alpha, fitted = fitted, residuals = resid, - sigma = as.numeric(sigma), lambda = lam, - sse = sse, r2 = as.numeric(r2), - estimate = mean(fitted), - se = sqrt(sse / max(1, n - 1)) / sqrt(n), - n = as.integer(n), - method = "RKHS kernel ridge (Wahba 1990)") + list( + alpha = alpha, fitted = fitted, residuals = resid, + sigma = as.numeric(sigma), lambda = lam, + sse = sse, r2 = as.numeric(r2), + estimate = mean(fitted), + se = sqrt(sse / max(1, n - 1)) / sqrt(n), + n = as.integer(n), + method = "RKHS kernel ridge (Wahba 1990)" + ) } # CANONICAL TEST @@ -43,4 +48,4 @@ rkhsc <- function(x, y, sigma = NULL, lam = 1e-3) { #' @rdname rkhsc #' @keywords internal #' @export -rkhs_kernel_regression <- rkhsc +morie_rkhs_kernel_regression <- rkhsc diff --git a/r-package/morie/R/rkhsf.R b/r-package/morie/R/rkhsf.R index 4a5cc85744..c9e9934b67 100644 --- a/r-package/morie/R/rkhsf.R +++ b/r-package/morie/R/rkhsf.R @@ -10,13 +10,11 @@ #' @return list(estimate, alpha, beta, K, f_hat, se, h, n, method). #' @references Gianola & van Kaam (2008). Montesinos Lopez Ch 5. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_rkhs_full(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) #' @export -rkhs_full <- function(x, y, markers, h = NULL, lam = 1) { - y <- as.numeric(y); n <- length(y) +morie_rkhs_full <- function(x, y, markers, h = NULL, lam = 1) { + y <- as.numeric(y) + n <- length(y) M <- as.matrix(markers) sq <- rowSums(M^2) D2 <- pmax(outer(sq, sq, "+") - 2 * tcrossprod(M), 0) @@ -28,8 +26,11 @@ rkhs_full <- function(x, y, markers, h = NULL, lam = 1) { K <- exp(-D2 / h) cand <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) { matrix(1, n, 1) - } else cbind(1, as.matrix(x)) - qrx <- qr(cand); X <- cand[, qrx$pivot[seq_len(qrx$rank)], drop = FALSE] + } else { + cbind(1, as.matrix(x)) + } + qrx <- qr(cand) + X <- cand[, qrx$pivot[seq_len(qrx$rank)], drop = FALSE] beta <- stats::lsfit(X, y, intercept = FALSE)$coefficients r <- y - as.numeric(X %*% beta) alpha <- solve(K + lam * diag(n), r) @@ -37,11 +38,13 @@ rkhs_full <- function(x, y, markers, h = NULL, lam = 1) { y_hat <- as.numeric(X %*% beta) + f_hat resid <- y - y_hat se <- sqrt(sum(resid^2) / max(n - ncol(X), 1)) - list(estimate = mean(f_hat), alpha = alpha, beta = beta, K = K, - f_hat = f_hat, se = se, h = h, n = n, - method = "RKHS regression (Gaussian kernel)") + list( + estimate = mean(f_hat), alpha = alpha, beta = beta, K = K, + f_hat = f_hat, se = se, h = h, n = n, + method = "RKHS regression (Gaussian kernel)" + ) } # CANONICAL TEST # set.seed(1); M <- matrix(sample(0:2, 20, TRUE), 5, 4) -# rkhs_full(rep(0,5), c(1,2,1.5,2.5,2), M) +# morie_rkhs_full(rep(0,5), c(1,2,1.5,2.5,2), M) diff --git a/r-package/morie/R/rlhfd.R b/r-package/morie/R/rlhfd.R index e422c03d50..d815d8c69e 100644 --- a/r-package/morie/R/rlhfd.R +++ b/r-package/morie/R/rlhfd.R @@ -13,6 +13,8 @@ rlhf_reward <- function(x, w = NULL, b = 0) { if (is.null(w)) w <- rep(1 / d, d) if (length(w) != d) stop(sprintf("w must have length %d", d)) r <- as.numeric(xm %*% w + b) - list(value = r[1L], tensor = r, - w = as.numeric(w), b = b, method = "rlhf-reward-head") + list( + value = r[1L], tensor = r, + w = as.numeric(w), b = b, method = "rlhf-reward-head" + ) } diff --git a/r-package/morie/R/rndsr.R b/r-package/morie/R/rndsr.R index 13627a95c0..5c8d44b4f9 100644 --- a/r-package/morie/R/rndsr.R +++ b/r-package/morie/R/rndsr.R @@ -18,36 +18,43 @@ #' @return Named list: estimate, best_params, best_score, sampled_params, #' sampled_scores, n_iter, task, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -random_search_cv <- function(x, y, method = NULL, n_iter = 20L, cv = 5L, - task = "auto", seed = 0L, - deterministic_seed = NULL) { +morie_random_search_cv <- function(x, y, method = NULL, n_iter = 20L, cv = 5L, + task = "auto", seed = 0L, + deterministic_seed = NULL) { + x <- .morie_ensure_design_matrix(x) if (!requireNamespace("caret", quietly = TRUE)) { - stop("Function 'random_search_cv' requires package 'caret'. Install with install.packages('caret').") + stop("Function 'morie_random_search_cv' requires package 'caret'. Install with install.packages('caret').") } if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); colnames(x) <- colnames(x) %||% paste0("x", seq_len(ncol(x)) - 1L) + x <- as.matrix(x) + colnames(x) <- colnames(x) %||% paste0("x", seq_len(ncol(x)) - 1L) if (identical(task, "auto")) { - task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) - "classification" else "regression" + task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) { + "classification" + } else { + "regression" + } } if (!is.null(deterministic_seed)) { morie_det_rng("rndsr", deterministic_seed) } else { set.seed(seed) } - ctrl <- caret::trainControl(method = "cv", number = cv, - search = "random", classProbs = FALSE) + ctrl <- caret::trainControl( + method = "cv", number = cv, + search = "random", classProbs = FALSE + ) if (is.null(method)) { method <- if (task == "classification") "glmnet" else "ridge" } y_use <- if (task == "classification") factor(make.names(as.character(y))) else as.numeric(y) - fit <- caret::train(x = x, y = y_use, method = method, - tuneLength = n_iter, trControl = ctrl) + fit <- caret::train( + x = x, y = y_use, method = method, + tuneLength = n_iter, trControl = ctrl + ) best <- fit$bestTune results <- fit$results metric <- if (task == "classification") "Accuracy" else "RMSE" diff --git a/r-package/morie/R/rnkbs.R b/r-package/morie/R/rnkbs.R index 75a1103139..87bdef7a85 100644 --- a/r-package/morie/R/rnkbs.R +++ b/r-package/morie/R/rnkbs.R @@ -9,18 +9,17 @@ #' @return Named list: statistic (tau), p_value, n, inversions, z. #' @importFrom stats cor.test #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_rank_based_test(x = rnorm(50)) #' @export -rank_based_test <- function(x) { +morie_rank_based_test <- function(x) { x <- as.numeric(x) n <- length(x) if (n < 3) { - return(list(statistic = NA_real_, p_value = NA_real_, n = n, - inversions = 0L, z = NA_real_, - method = "Mann's rank test for randomness")) + return(list( + statistic = NA_real_, p_value = NA_real_, n = n, + inversions = 0L, z = NA_real_, + method = "Mann's rank test for randomness" + )) } t <- seq_len(n) ct <- suppressWarnings(stats::cor.test(t, x, method = "kendall")) diff --git a/r-package/morie/R/rnkor.R b/r-package/morie/R/rnkor.R index 1a854ffca9..bfa0e25987 100644 --- a/r-package/morie/R/rnkor.R +++ b/r-package/morie/R/rnkor.R @@ -9,19 +9,18 @@ #' @param mu0 Hypothesised median (default 0). #' @return Named list: signed_ranks, abs_ranks, W_plus, W_minus, n_nonzero, n. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_rank_order_statistics(x = rnorm(50)) #' @export -rank_order_statistics <- function(x, mu0 = 0) { +morie_rank_order_statistics <- function(x, mu0 = 0) { x <- as.numeric(x) n <- length(x) if (n < 2) { - return(list(signed_ranks = numeric(0), abs_ranks = numeric(0), - W_plus = NA_real_, W_minus = NA_real_, - n_nonzero = 0L, n = n, - method = "Rank-order signed ranks")) + return(list( + signed_ranks = numeric(0), abs_ranks = numeric(0), + W_plus = NA_real_, W_minus = NA_real_, + n_nonzero = 0L, n = n, + method = "Rank-order signed ranks" + )) } d <- x - mu0 nz <- d != 0 diff --git a/r-package/morie/R/rnnge.R b/r-package/morie/R/rnnge.R index 6f184f6966..20fcc3b146 100644 --- a/r-package/morie/R/rnnge.R +++ b/r-package/morie/R/rnnge.R @@ -13,22 +13,23 @@ #' @return list(estimate, y_hat, W_h, W_x, b_h, w_o, b_o, se, n, method). #' @references Montesinos Lopez Ch 14. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_rnn_genomic(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) #' @export -rnn_genomic <- function(x, y, markers, hidden = 8, n_epochs = 150, - lr = 1e-2, l2 = 1e-3, seed = 0, - deterministic_seed = NULL) { +morie_rnn_genomic <- function(x, y, markers, hidden = 8, n_epochs = 150, + lr = 1e-2, l2 = 1e-3, seed = 0, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("rnnge", deterministic_seed) } else { set.seed(seed) } - y <- as.numeric(y); n <- length(y) - M <- as.matrix(markers); L <- ncol(M) - M_mu <- colMeans(M); M_sd <- apply(M, 2, stats::sd); M_sd[M_sd == 0] <- 1 + y <- as.numeric(y) + n <- length(y) + M <- as.matrix(markers) + L <- ncol(M) + M_mu <- colMeans(M) + M_sd <- apply(M, 2, stats::sd) + M_sd[M_sd == 0] <- 1 Ms <- sweep(sweep(M, 2, M_mu), 2, M_sd, "/") H <- hidden Wh <- matrix(stats::rnorm(H * H, 0, 1 / sqrt(H)), H, H) @@ -48,11 +49,15 @@ rnn_genomic <- function(x, y, markers, hidden = 8, n_epochs = 150, y_hat <- as.numeric(h_state %*% wo) + bo resid <- y_hat - y dy <- resid / n - dwo <- as.numeric(crossprod(h_state, dy)) + l2 * wo; dbo <- sum(dy) + dwo <- as.numeric(crossprod(h_state, dy)) + l2 * wo + dbo <- sum(dy) dh <- outer(dy, wo) - dWh <- matrix(0, H, H); dWx <- 0; dbh <- rep(0, H) + dWh <- matrix(0, H, H) + dWx <- 0 + dbh <- rep(0, H) for (t in rev(seq_len(L))) { - h_t <- hs[[t + 1]]; h_prev <- hs[[t]] + h_t <- hs[[t + 1]] + h_prev <- hs[[t]] dh_raw <- dh * (1 - h_t^2) dWh <- dWh + crossprod(h_prev, dh_raw) dWx <- dWx + sum(Ms[, t] * dh_raw) @@ -63,7 +68,8 @@ rnn_genomic <- function(x, y, markers, hidden = 8, n_epochs = 150, Wh <- Wh - lr * dWh Wx <- Wx - lr * (dWx + l2 * Wx) bh <- bh - lr * dbh - wo <- wo - lr * dwo; bo <- bo - lr * dbo + wo <- wo - lr * dwo + bo <- bo - lr * dbo losses[ep] <- mean(resid^2) } h_state <- matrix(0, n, H) @@ -73,12 +79,14 @@ rnn_genomic <- function(x, y, markers, hidden = 8, n_epochs = 150, } y_hat <- as.numeric(h_state %*% wo) + bo resid <- y - y_hat - list(estimate = mean(y_hat), y_hat = y_hat, - W_h = Wh, W_x = Wx, b_h = bh, w_o = wo, b_o = bo, - loss_curve = losses, se = sqrt(mean(resid^2)), - n = n, method = "Vanilla RNN BPTT (base R)") + list( + estimate = mean(y_hat), y_hat = y_hat, + W_h = Wh, W_x = Wx, b_h = bh, w_o = wo, b_o = bo, + loss_curve = losses, se = sqrt(mean(resid^2)), + n = n, method = "Vanilla RNN BPTT (base R)" + ) } # CANONICAL TEST # set.seed(8); M <- matrix(rnorm(90), 15, 6); y <- rowSums(M)+0.2*rnorm(15) -# rnn_genomic(rep(0,15), y, M, n_epochs=20, seed=8) +# morie_rnn_genomic(rep(0,15), y, M, n_epochs=20, seed=8) diff --git a/r-package/morie/R/rocau.R b/r-package/morie/R/rocau.R index 24954191fd..76f3bd581d 100644 --- a/r-package/morie/R/rocau.R +++ b/r-package/morie/R/rocau.R @@ -9,23 +9,24 @@ #' @return Named list: estimate, auc, fpr, tpr, thresholds, n, #' n_positive, n_negative, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -roc_auc_score <- function(y_true, y_score) { +morie_roc_auc_score <- function(y_true, y_score) { if (!requireNamespace("pROC", quietly = TRUE)) { - stop("Function 'roc_auc_score' requires package 'pROC'. Install with install.packages('pROC').") + stop("Function 'morie_roc_auc_score' requires package 'pROC'. Install with install.packages('pROC').") } - yt <- as.numeric(y_true); ys <- as.numeric(y_score) + yt <- as.numeric(y_true) + ys <- as.numeric(y_score) classes <- sort(unique(yt)) - if (length(classes) != 2) stop("roc_auc_score requires binary y_true") + if (length(classes) != 2) stop("morie_roc_auc_score requires binary y_true") pos <- classes[2] yt_b <- as.integer(yt == pos) - rc <- pROC::roc(response = yt_b, predictor = ys, - levels = c(0, 1), direction = "<", - quiet = TRUE) + rc <- pROC::roc( + response = yt_b, predictor = ys, + levels = c(0, 1), direction = "<", + quiet = TRUE + ) auc <- as.numeric(pROC::auc(rc)) # pROC reports specificity (= 1 - FPR) and sensitivity (= TPR) per threshold fpr <- 1 - rc$specificities diff --git a/r-package/morie/R/rotrp.R b/r-package/morie/R/rotrp.R index 1d0c689028..12f2af0f1e 100644 --- a/r-package/morie/R/rotrp.R +++ b/r-package/morie/R/rotrp.R @@ -12,35 +12,37 @@ #' @return Named list \code{(y, estimate, angles, method)}. #' @references Su et al. (2021), arXiv:2104.09864. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -rotrp_rotary_position_embedding <- function(x, base = 10000) { +morie_rotrp_rotary_position_embedding <- function(x, base = 10000) { x <- as.matrix(x) - seq_len <- nrow(x); d <- ncol(x) + seq_len <- nrow(x) + d <- ncol(x) if (d %% 2L != 0L) stop(sprintf("d_model must be even for RoPE, got %d", d)) half <- d %/% 2L pos <- seq_len(seq_len) - 1L idx <- seq_len(half) - 1L - inv_freq <- 1 / base ^ ((2 * idx) / d) + inv_freq <- 1 / base^((2 * idx) / d) angles <- outer(pos, inv_freq) - cos_a <- cos(angles); sin_a <- sin(angles) + cos_a <- cos(angles) + sin_a <- sin(angles) even <- seq(1L, d, by = 2L) odd <- seq(2L, d, by = 2L) x_even <- x[, even, drop = FALSE] - x_odd <- x[, odd, drop = FALSE] + x_odd <- x[, odd, drop = FALSE] y_even <- x_even * cos_a - x_odd * sin_a - y_odd <- x_even * sin_a + x_odd * cos_a + y_odd <- x_even * sin_a + x_odd * cos_a y <- x y[, even] <- y_even - y[, odd] <- y_odd - list(y = y, estimate = y, angles = angles, - method = "Rotary position embedding") + y[, odd] <- y_odd + list( + y = y, estimate = y, angles = angles, + method = "Rotary position embedding" + ) } -#' @rdname rotrp_rotary_position_embedding +#' @rdname morie_rotrp_rotary_position_embedding #' @keywords internal #' @export -rotary_position_embedding <- rotrp_rotary_position_embedding +morie_rotary_position_embedding <- morie_rotrp_rotary_position_embedding diff --git a/r-package/morie/R/rptpn.R b/r-package/morie/R/rptpn.R index f0371a2c59..fc32a4e6fa 100644 --- a/r-package/morie/R/rptpn.R +++ b/r-package/morie/R/rptpn.R @@ -9,12 +9,19 @@ #' @keywords internal repetition_penalty <- function(x, generated, alpha = 1.2) { z <- as.numeric(x) - if (alpha == 1) return(list(tensor = z, penalised_idx = integer(0), - alpha = alpha, method = "rep-penalty")) + if (alpha == 1) { + return(list( + tensor = z, penalised_idx = integer(0), + alpha = alpha, method = "rep-penalty" + )) + } idx <- unique(as.integer(generated)) idx <- idx[idx >= 0L & idx < length(z)] - for (i in idx) + for (i in idx) { z[i + 1L] <- if (z[i + 1L] > 0) z[i + 1L] / alpha else z[i + 1L] * alpha - list(tensor = z, penalised_idx = idx, alpha = alpha, - method = "rep-penalty") + } + list( + tensor = z, penalised_idx = idx, alpha = alpha, + method = "rep-penalty" + ) } diff --git a/r-package/morie/R/rslnk.R b/r-package/morie/R/rslnk.R index 16e762f830..e9881f6291 100644 --- a/r-package/morie/R/rslnk.R +++ b/r-package/morie/R/rslnk.R @@ -11,22 +11,22 @@ #' @return Named list \code{(y, estimate, Fx, method)}. #' @references He, Zhang, Ren & Sun (2016), CVPR. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_rslnk_residual_connection(x = rnorm(50)) #' @export -rslnk_residual_connection <- function(x, f = NULL) { +morie_rslnk_residual_connection <- function(x, f = NULL) { x <- as.array(x) Fx <- if (is.null(f)) x else as.array(f(x)) - if (!identical(dim(Fx), dim(x)) && length(Fx) != length(x)) + if (!identical(dim(Fx), dim(x)) && length(Fx) != length(x)) { stop("Residual branch shape does not match identity shape.") + } y <- Fx + x - list(y = y, estimate = y, Fx = Fx, - method = "Residual identity shortcut") + list( + y = y, estimate = y, Fx = Fx, + method = "Residual identity shortcut" + ) } -#' @rdname rslnk_residual_connection +#' @rdname morie_rslnk_residual_connection #' @keywords internal #' @export -residual_connection <- rslnk_residual_connection +morie_residual_connection <- morie_rslnk_residual_connection diff --git a/r-package/morie/R/sampling.R b/r-package/morie/R/sampling.R index 624754bd93..e6d94ac28c 100644 --- a/r-package/morie/R/sampling.R +++ b/r-package/morie/R/sampling.R @@ -22,8 +22,8 @@ NULL #' @export #' @examples #' df <- data.frame(x = 1:100) -#' srs_sample <- simple_random_sample(df, 20) -simple_random_sample <- function(df, n, replace = FALSE, seed = 42L) { +#' srs_sample <- morie_simple_random_sample(df, 20) +morie_simple_random_sample <- function(df, n, replace = FALSE, seed = 42L) { set.seed(seed) N <- nrow(df) if (n > N && !replace) stop("n exceeds population size for SRS WOR.") @@ -52,9 +52,9 @@ simple_random_sample <- function(df, n, replace = FALSE, seed = 42L) { #' @export #' @examples #' df <- data.frame(g = c(rep("A", 60), rep("B", 40)), x = rnorm(100)) -#' stratified_sample(df, "g", n_per_stratum = 10) -stratified_sample <- function(df, strata_col, n_per_stratum, - proportional = FALSE, seed = 42L) { +#' morie_stratified_sample(df, "g", n_per_stratum = 10) +morie_stratified_sample <- function(df, strata_col, n_per_stratum, + proportional = FALSE, seed = 42L) { set.seed(seed) strata <- split(seq_len(nrow(df)), df[[strata_col]]) strata_sizes <- lengths(strata) @@ -62,7 +62,9 @@ stratified_sample <- function(df, strata_col, n_per_stratum, if (proportional) { total_n <- if (is.numeric(n_per_stratum) && length(n_per_stratum) == 1L) { n_per_stratum - } else stop("For proportional = TRUE, supply a single integer for n_per_stratum.") + } else { + stop("For proportional = TRUE, supply a single integer for n_per_stratum.") + } alloc <- round(strata_sizes / sum(strata_sizes) * total_n) alloc <- pmax(alloc, 1L) } else { @@ -99,12 +101,10 @@ stratified_sample <- function(df, strata_col, n_per_stratum, #' @param seed Random seed. #' @return Data frame of selected units with `.weight` column. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -cluster_sample <- function(df, cluster_col, n_clusters, seed = 42L) { +morie_cluster_sample <- function(df, cluster_col, n_clusters, seed = 42L) { set.seed(seed) all_clusters <- unique(df[[cluster_col]]) N_clusters <- length(all_clusters) @@ -128,12 +128,10 @@ cluster_sample <- function(df, cluster_col, n_clusters, seed = 42L) { #' @param seed Random seed. #' @return Data frame of selected units with `.weight` (Hansen-Hurwitz weights). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -pps_sample <- function(df, size_col, n, seed = 42L) { +morie_pps_sample <- function(df, size_col, n, seed = 42L) { set.seed(seed) sizes <- as.numeric(df[[size_col]]) if (any(sizes <= 0, na.rm = TRUE)) stop("size_col must be positive.") @@ -161,22 +159,22 @@ pps_sample <- function(df, size_col, n, seed = 42L) { #' @export #' @examples #' df <- data.frame(x = rnorm(100)) -#' bootstrap_sample(df, statistic = function(d) mean(d$x)) -bootstrap_sample <- function(df, statistic, n_bootstrap = 1000L, seed = 42L) { +#' morie_bootstrap_sample(df, statistic = function(d) mean(d$x)) +morie_bootstrap_sample <- function(df, statistic, n_bootstrap = 1000L, seed = 42L) { set.seed(seed) n <- nrow(df) boot_stats <- vapply(seq_len(n_bootstrap), function(i) { idx <- sample.int(n, n, replace = TRUE) statistic(df[idx, , drop = FALSE]) }, numeric(1)) - est <- mean(boot_stats) - se <- stats::sd(boot_stats) - ci <- stats::quantile(boot_stats, c(0.025, 0.975)) + est <- mean(boot_stats) + se <- stats::sd(boot_stats) + ci <- stats::quantile(boot_stats, c(0.025, 0.975)) list( - estimate = est, - se = se, - ci_lower = ci[1], - ci_upper = ci[2], + estimate = est, + se = se, + ci_lower = ci[1], + ci_upper = ci[2], distribution = boot_stats ) } @@ -192,19 +190,17 @@ bootstrap_sample <- function(df, statistic, n_bootstrap = 1000L, seed = 42L) { #' @param statistic A function taking a data frame and returning a scalar. #' @return Named list: `estimate`, `se`, `bias`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -jackknife_estimate <- function(df, statistic) { +morie_jackknife_estimate <- function(df, statistic) { n <- nrow(df) theta_full <- statistic(df) theta_minus_i <- vapply(seq_len(n), function(i) { statistic(df[-i, , drop = FALSE]) }, numeric(1)) theta_bar <- mean(theta_minus_i) - se <- sqrt((n - 1) / n * sum((theta_minus_i - theta_bar)^2)) + se <- sqrt((n - 1) / n * sum((theta_minus_i - theta_bar)^2)) bias <- (n - 1) * (theta_bar - theta_full) list(estimate = theta_full, se = se, bias = bias) } @@ -219,12 +215,10 @@ jackknife_estimate <- function(df, statistic) { #' @param weights Numeric vector of sampling weights. #' @return Numeric ESS. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -effective_sample_size <- function(weights) { +morie_effective_sample_size <- function(weights) { w <- as.numeric(weights) w <- w[!is.na(w) & w > 0] (sum(w)^2) / sum(w^2) @@ -235,15 +229,13 @@ effective_sample_size <- function(weights) { #' @param weights Numeric vector of sampling weights. #' @return Numeric design effect (= n / ESS). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -design_effect <- function(weights) { +morie_design_effect <- function(weights) { w <- as.numeric(weights) w <- w[!is.na(w) & w > 0] - length(w) / effective_sample_size(w) + length(w) / morie_effective_sample_size(w) } @@ -258,12 +250,10 @@ design_effect <- function(weights) { #' @param population_sizes Named integer vector: stratum level -> population size. #' @return Numeric vector of design weights (same length as `nrow(df)`). #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -compute_design_weights <- function(df, strata_col, population_sizes) { +morie_compute_design_weights <- function(df, strata_col, population_sizes) { strata <- df[[strata_col]] sample_sizes <- table(strata) pop_sizes <- population_sizes[names(sample_sizes)] @@ -289,10 +279,21 @@ compute_design_weights <- function(df, strata_col, population_sizes) { #' @param max_iter Maximum IPF iterations. #' @param tol Convergence tolerance. #' @return Numeric vector of calibrated weights. +#' @examples +#' set.seed(1) +#' df <- data.frame( +#' region = sample(c("A", "B"), 100, TRUE), +#' sex = sample(c("M", "F"), 100, TRUE) +#' ) +#' totals <- list(region_A = 60, region_B = 40, sex_M = 55, sex_F = 45) +#' morie_calibration_weights(df, +#' aux_vars = c("region", "sex"), +#' population_totals = totals +#' ) #' @export -calibration_weights <- function(df, aux_vars, population_totals, - initial_weights = NULL, - max_iter = 50L, tol = 1e-6) { +morie_calibration_weights <- function(df, aux_vars, population_totals, + initial_weights = NULL, + max_iter = 50L, tol = 1e-6) { n <- nrow(df) w <- if (!is.null(initial_weights)) initial_weights else rep(1, n) @@ -304,7 +305,7 @@ calibration_weights <- function(df, aux_vars, population_totals, key <- paste0(v, "_", lv) if (!key %in% names(population_totals)) next pop_tot <- population_totals[[key]] - mask <- df[[v]] == lv + mask <- df[[v]] == lv sample_tot <- sum(w[mask]) if (sample_tot > 0) w[mask] <- w[mask] * pop_tot / sample_tot } diff --git a/r-package/morie/R/sarla.R b/r-package/morie/R/sarla.R index 226afa92d4..7404f876a4 100644 --- a/r-package/morie/R/sarla.R +++ b/r-package/morie/R/sarla.R @@ -1,4 +1,25 @@ # SPDX-License-Identifier: AGPL-3.0-or-later + +# Internal: SAR-lag concentrated negative log-likelihood in rho. +# Extracted from the sarla() optimiser closure so the singular-system / +# non-positive-variance guard is directly unit-testable. `e0`, `e1` are +# the residual projections, `n` the sample size, `I`/`W` the identity +# and spatial-weights matrices. +.sarla_negll <- function(rho, e0, e1, n, I, W) { + e <- e0 - rho * e1 + sigma2 <- as.numeric(sum(e^2)) / n + A <- I - rho * W + det_sign <- determinant(A, logarithm = TRUE) + # !is.finite(modulus) catches a singular A (det == 0), for which + # determinant() still reports sign = +1; without it the -logdetA term + # below would be +Inf rather than the intended penalty. + if (det_sign$sign <= 0 || sigma2 <= 0 || !is.finite(det_sign$modulus)) { + return(1e12) + } + logdetA <- as.numeric(det_sign$modulus) + 0.5 * n * log(2 * pi * sigma2) - logdetA + 0.5 * n +} + #' Spatial autoregressive lag model (SAR lag, ML). #' #' Y = rho W Y + X beta + eps, eps ~ N(0, sigma2 I). @@ -10,41 +31,37 @@ #' @return Named list: estimate, se, rho, sigma2, n, method. #' @references Anselin (1988); Schabenberger & Gotway (2005), Ch 7. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export sarla <- function(x, y, w) { - X <- as.matrix(x); y <- as.numeric(y); W <- as.matrix(w) - n <- length(y); p <- ncol(X) - if (nrow(X) != n || any(dim(W) != c(n, n))) + X <- as.matrix(x) + y <- as.numeric(y) + W <- as.matrix(w) + n <- length(y) + p <- ncol(X) + if (nrow(X) != n || any(dim(W) != c(n, n))) { stop("shape mismatch among x, y, w") + } I <- diag(n) XtX_inv <- solve(crossprod(X)) M <- I - X %*% XtX_inv %*% t(X) e0 <- M %*% y e1 <- M %*% (W %*% y) - neg_ll <- function(rho) { - e <- e0 - rho * e1 - sigma2 <- as.numeric(sum(e ^ 2)) / n - A <- I - rho * W - det_sign <- determinant(A, logarithm = TRUE) - if (det_sign$sign <= 0 || sigma2 <= 0) return(1e12) - logdetA <- as.numeric(det_sign$modulus) - 0.5 * n * log(2 * pi * sigma2) - logdetA + 0.5 * n - } + neg_ll <- function(rho) .sarla_negll(rho, e0, e1, n, I, W) res <- stats::optimize(neg_ll, interval = c(-0.99, 0.99)) rho <- res$minimum Wy <- W %*% y y_star <- as.numeric(y - rho * Wy) beta <- as.numeric(XtX_inv %*% crossprod(X, y_star)) e <- y_star - as.numeric(X %*% beta) - sigma2 <- sum(e ^ 2) / max(n - p - 1, 1) + sigma2 <- sum(e^2) / max(n - p - 1, 1) cov_b <- sigma2 * XtX_inv se <- sqrt(pmax(diag(cov_b), 0)) - list(estimate = beta, se = se, rho = rho, sigma2 = sigma2, n = n, - method = "SAR lag (ML, concentrated log-likelihood)") + list( + estimate = beta, se = se, rho = rho, sigma2 = sigma2, n = n, + method = "SAR lag (ML, concentrated log-likelihood)" + ) } # CANONICAL TEST (with row-standardised path graph) @@ -52,4 +69,4 @@ sarla <- function(x, y, w) { #' @rdname sarla #' @keywords internal #' @export -spatial_ar_lag <- sarla +morie_spatial_ar_lag <- sarla diff --git a/r-package/morie/R/sarre.R b/r-package/morie/R/sarre.R index 8c13335e24..185d564d62 100644 --- a/r-package/morie/R/sarre.R +++ b/r-package/morie/R/sarre.R @@ -1,4 +1,34 @@ # SPDX-License-Identifier: AGPL-3.0-or-later + +# Internal: SAR-error concentrated negative log-likelihood in lambda. +# Extracted from the sarre() optimiser closure so the singular-GLS, +# non-positive-variance and non-positive-determinant guards are all +# directly unit-testable. +.sarre_negll <- function(lam, I, W, X, y, n) { + A <- I - lam * W + AX <- A %*% X + Ay <- A %*% y + beta <- tryCatch(solve(crossprod(AX), crossprod(AX, Ay)), + error = function(e) NULL + ) + if (is.null(beta)) { + return(1e12) + } + e <- Ay - AX %*% beta + sigma2 <- as.numeric(sum(e^2)) / n + if (sigma2 <= 0) { + return(1e12) + } + det_sign <- determinant(A, logarithm = TRUE) + # !is.finite(modulus) catches a singular A (det == 0), for which + # determinant() still reports sign = +1. + if (det_sign$sign <= 0 || !is.finite(det_sign$modulus)) { + return(1e12) + } + logdetA <- as.numeric(det_sign$modulus) + 0.5 * n * log(2 * pi * sigma2) - logdetA + 0.5 * n +} + #' Spatial autoregressive error model (SAR error, ML). #' #' Y = X beta + u, u = lambda W u + eps, eps ~ N(0, sigma2 I). @@ -11,42 +41,34 @@ #' @return Named list: estimate, se, lambda, sigma2, n, method. #' @references Anselin (1988); Schabenberger & Gotway (2005), Ch 7. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export sarre <- function(x, y, w) { - X <- as.matrix(x); y <- as.numeric(y); W <- as.matrix(w) - n <- length(y); p <- ncol(X) - if (nrow(X) != n || any(dim(W) != c(n, n))) + X <- as.matrix(x) + y <- as.numeric(y) + W <- as.matrix(w) + n <- length(y) + p <- ncol(X) + if (nrow(X) != n || any(dim(W) != c(n, n))) { stop("shape mismatch among x, y, w") - I <- diag(n) - neg_ll <- function(lam) { - A <- I - lam * W - AX <- A %*% X; Ay <- A %*% y - beta <- tryCatch(solve(crossprod(AX), crossprod(AX, Ay)), - error = function(e) NULL) - if (is.null(beta)) return(1e12) - e <- Ay - AX %*% beta - sigma2 <- as.numeric(sum(e ^ 2)) / n - if (sigma2 <= 0) return(1e12) - det_sign <- determinant(A, logarithm = TRUE) - if (det_sign$sign <= 0) return(1e12) - logdetA <- as.numeric(det_sign$modulus) - 0.5 * n * log(2 * pi * sigma2) - logdetA + 0.5 * n } + I <- diag(n) + neg_ll <- function(lam) .sarre_negll(lam, I, W, X, y, n) res <- stats::optimize(neg_ll, interval = c(-0.99, 0.99)) lam <- res$minimum A <- I - lam * W - AX <- A %*% X; Ay <- A %*% y + AX <- A %*% X + Ay <- A %*% y beta <- as.numeric(solve(crossprod(AX), crossprod(AX, Ay))) e <- as.numeric(Ay - AX %*% beta) - sigma2 <- as.numeric(sum(e ^ 2)) / max(n - p, 1) + sigma2 <- as.numeric(sum(e^2)) / max(n - p, 1) cov_b <- sigma2 * solve(crossprod(AX)) se <- sqrt(pmax(diag(cov_b), 0)) - list(estimate = beta, se = se, lambda = lam, sigma2 = sigma2, - n = n, method = "SAR error (ML, concentrated log-likelihood)") + list( + estimate = beta, se = se, lambda = lam, sigma2 = sigma2, + n = n, method = "SAR error (ML, concentrated log-likelihood)" + ) } # CANONICAL TEST (with row-standardised path graph) @@ -54,4 +76,4 @@ sarre <- function(x, y, w) { #' @rdname sarre #' @keywords internal #' @export -spatial_ar_error <- sarre +morie_spatial_ar_error <- sarre diff --git a/r-package/morie/R/sglm.R b/r-package/morie/R/sglm.R index 7b40c00b1a..32c38aabff 100644 --- a/r-package/morie/R/sglm.R +++ b/r-package/morie/R/sglm.R @@ -1,4 +1,25 @@ # SPDX-License-Identifier: AGPL-3.0-or-later + +# Internal: spatial-GLM profile negative log-likelihood in log(phi). +# Extracted from the sglm() optimiser closure so the non-positive- +# -definite-covariance guard is directly unit-testable. `D` is the +# distance matrix, `n` the sample size, `X`/`y` the design and response. +.sglm_negll <- function(log_phi, D, n, X, y) { + phi <- exp(log_phi) + R <- exp(-D / phi) + 1e-8 * diag(n) + L <- tryCatch(chol(R), error = function(e) NULL) + if (is.null(L)) { + return(1e12) + } + Xw <- backsolve(L, X, transpose = TRUE) + yw <- backsolve(L, y, transpose = TRUE) + beta <- solve(crossprod(Xw), crossprod(Xw, yw)) + resid <- yw - Xw %*% beta + sigma2 <- sum(resid^2) / n + logdet_R <- 2 * sum(log(diag(L))) + 0.5 * (n * log(2 * pi * sigma2) + logdet_R + n) +} + #' Spatial GLM (Gaussian-identity case via profile ML). #' #' Y = X beta + delta + eps, delta ~ GP(0, sigma2 R_phi), R_phi exponential. @@ -11,37 +32,31 @@ #' @return Named list: estimate, se, sigma2, phi, tau2, n, method. #' @references Schabenberger & Gotway (2005), Ch 5. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' sglm(x = rnorm(50), y = rnorm(50), coords = matrix(runif(100), 50, 2)) #' @export sglm <- function(x, y, coords, family = "gaussian") { - if (family != "gaussian") + if (family != "gaussian") { stop("sglm: family=", family, " needs PQL/Laplace; tracker for v0.3.0") - X <- as.matrix(x); y <- as.numeric(y); n <- length(y) - coords <- if (is.matrix(coords)) coords else + } + X <- as.matrix(x) + y <- as.numeric(y) + n <- length(y) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } p <- ncol(X) - if (nrow(X) != n || nrow(coords) != n) + if (nrow(X) != n || nrow(coords) != n) { stop("shape mismatch among x, y, coords") + } D <- as.matrix(stats::dist(coords)) h_max <- max(D) - neg_ll <- function(log_phi) { - phi <- exp(log_phi) - R <- exp(-D / phi) + 1e-8 * diag(n) - L <- tryCatch(chol(R), error = function(e) NULL) - if (is.null(L)) return(1e12) - Xw <- backsolve(L, X, transpose = TRUE) - yw <- backsolve(L, y, transpose = TRUE) - beta <- solve(crossprod(Xw), crossprod(Xw, yw)) - resid <- yw - Xw %*% beta - sigma2 <- sum(resid ^ 2) / n - logdet_R <- 2 * sum(log(diag(L))) - 0.5 * (n * log(2 * pi * sigma2) + logdet_R + n) - } - res <- stats::optimize(neg_ll, interval = c(log(max(h_max / 100, 1e-3)), - log(max(h_max * 3, 1)))) + neg_ll <- function(log_phi) .sglm_negll(log_phi, D, n, X, y) + res <- stats::optimize(neg_ll, interval = c( + log(max(h_max / 100, 1e-3)), + log(max(h_max * 3, 1)) + )) phi <- exp(res$minimum) R <- exp(-D / phi) + 1e-8 * diag(n) L <- chol(R) @@ -50,11 +65,13 @@ sglm <- function(x, y, coords, family = "gaussian") { XtSiX <- crossprod(Xw) beta <- as.numeric(solve(XtSiX, crossprod(Xw, yw))) resid <- yw - Xw %*% beta - sigma2 <- as.numeric(sum(resid ^ 2) / max(n - p, 1)) + sigma2 <- as.numeric(sum(resid^2) / max(n - p, 1)) se_beta <- sqrt(diag(sigma2 * solve(XtSiX))) - list(estimate = beta, se = se_beta, sigma2 = sigma2, phi = phi, - tau2 = 0, n = n, - method = "Spatial GLM (Gaussian, exponential covariance, ML)") + list( + estimate = beta, se = se_beta, sigma2 = sigma2, phi = phi, + tau2 = 0, n = n, + method = "Spatial GLM (Gaussian, exponential covariance, ML)" + ) } # CANONICAL TEST @@ -63,4 +80,4 @@ sglm <- function(x, y, coords, family = "gaussian") { #' @rdname sglm #' @keywords internal #' @export -spatial_glm <- sglm +morie_spatial_glm <- sglm diff --git a/r-package/morie/R/sgnpw.R b/r-package/morie/R/sgnpw.R index 5ba1bf5a4f..88368140d5 100644 --- a/r-package/morie/R/sgnpw.R +++ b/r-package/morie/R/sgnpw.R @@ -13,36 +13,42 @@ #' k_lower, k_upper. #' @importFrom stats dbinom #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_sign_test_power(x = rnorm(50)) #' @export -sign_test_power <- function(x, mu0 = 0, p_alt = 0.7, alpha = 0.05) { +morie_sign_test_power <- function(x, mu0 = 0, p_alt = 0.7, alpha = 0.05) { x <- as.numeric(x) n <- sum(x != mu0) if (n < 1 || !(p_alt > 0 && p_alt < 1)) { - return(list(statistic = NA_real_, n = n, p_alt = p_alt, - alpha = alpha, - method = "Sign-test power")) + return(list( + statistic = NA_real_, n = n, p_alt = p_alt, + alpha = alpha, + method = "Sign-test power" + )) } k_grid <- 0:n null_pmf <- stats::dbinom(k_grid, n, 0.5) ord <- order(null_pmf) - cum <- 0; reject <- logical(n + 1) + cum <- 0 + reject <- logical(n + 1) for (k in ord) { - if (cum + null_pmf[k + 1] <= alpha) { - reject[k + 1] <- TRUE - cum <- cum + null_pmf[k + 1] - } else break + if (cum + null_pmf[k] <= alpha) { + reject[k] <- TRUE + cum <- cum + null_pmf[k] + } else { + break + } } size <- cum if (!any(reject)) { - return(list(statistic = 0, n = n, p_alt = p_alt, - alpha = alpha, size = 0, - method = "Sign-test power", - warnings = sprintf("No rejection region of size <= %g for n=%d", - alpha, n))) + return(list( + statistic = 0, n = n, p_alt = p_alt, + alpha = alpha, size = 0, + method = "Sign-test power", + warnings = sprintf( + "No rejection region of size <= %g for n=%d", + alpha, n + ) + )) } alt_pmf <- stats::dbinom(k_grid, n, p_alt) power <- sum(alt_pmf[reject]) diff --git a/r-package/morie/R/signal.R b/r-package/morie/R/signal.R index fb9fc67fea..556bcc7240 100644 --- a/r-package/morie/R/signal.R +++ b/r-package/morie/R/signal.R @@ -14,10 +14,10 @@ #' \donttest{ #' if (requireNamespace("signal", quietly = TRUE)) { #' set.seed(1) -#' t <- seq(0, 1, length.out = 500) -#' x <- sin(2 * pi * 5 * t) + 0.5 * sin(2 * pi * 60 * t) # 5 Hz + 60 Hz -#' y <- buttlp(x, fs = 500, cutoff = 20) -#' length(y$filtered) # 500 +#' t <- seq(0, 1, length.out = 500) +#' x <- sin(2 * pi * 5 * t) + 0.5 * sin(2 * pi * 60 * t) # 5 Hz + 60 Hz +#' y <- buttlp(x, fs = 500, cutoff = 20) +#' length(y$filtered) # 500 #' } #' } buttlp <- function(x, fs, cutoff, order = 4L) { @@ -46,7 +46,7 @@ buttlp <- function(x, fs, cutoff, order = 4L) { #' if (requireNamespace("signal", quietly = TRUE)) { #' set.seed(1) #' t <- seq(0, 1, length.out = 500) -#' x <- 5 * t + sin(2 * pi * 10 * t) # linear drift + 10 Hz signal +#' x <- 5 * t + sin(2 * pi * 10 * t) # linear drift + 10 Hz signal #' y <- butthp(x, fs = 500, cutoff = 1) #' length(y$filtered) #' } @@ -79,7 +79,7 @@ butthp <- function(x, fs, cutoff, order = 4L) { #' t <- seq(0, 1, length.out = 1000) #' # 2 Hz drift + 10 Hz band of interest + 60 Hz noise #' x <- sin(2 * pi * 2 * t) + sin(2 * pi * 10 * t) + -#' 0.3 * sin(2 * pi * 60 * t) +#' 0.3 * sin(2 * pi * 60 * t) #' y <- buttbp(x, fs = 1000, low = 5, high = 20) #' length(y$filtered) #' } @@ -111,7 +111,7 @@ buttbp <- function(x, fs, low, high, order = 4L) { #' set.seed(1) #' t <- seq(0, 1, length.out = 1000) #' x <- sin(2 * pi * 10 * t) + sin(2 * pi * 60 * t) -#' y <- buttbs(x, fs = 1000) # remove 60 Hz mains +#' y <- buttbs(x, fs = 1000) # remove 60 Hz mains #' length(y$filtered) #' } #' } @@ -141,11 +141,11 @@ buttbs <- function(x, fs, low = 59, high = 61, order = 4L) { #' set.seed(1) #' t <- seq(0, 1, length.out = 200) #' x <- sin(2 * pi * 3 * t) + rnorm(200, sd = 0.2) -#' y <- sgolay_smooth(x, window_length = 11, polyorder = 3) +#' y <- morie_sgolay_smooth(x, window_length = 11, polyorder = 3) #' length(y$filtered) #' } #' } -sgolay_smooth <- function(x, window_length = 11L, polyorder = 3L) { +morie_sgolay_smooth <- function(x, window_length = 11L, polyorder = 3L) { if (requireNamespace("signal", quietly = TRUE)) { sg <- signal::sgolay(p = polyorder, n = window_length) y <- signal::filter(sg, x) @@ -169,16 +169,17 @@ sgolay_smooth <- function(x, window_length = 11L, polyorder = 3L) { #' \donttest{ #' if (requireNamespace("pracma", quietly = TRUE)) { #' set.seed(1) -#' x <- cumsum(rnorm(2048)) # Brownian motion, expected H ~ 0.5 -#' res <- hurst_r(x) +#' x <- cumsum(rnorm(2048)) # Brownian motion, expected H ~ 0.5 +#' res <- morie_hurst_r(x) #' res$interpretation #' } #' } -hurst_r <- function(x) { +morie_hurst_r <- function(x) { if (requireNamespace("pracma", quietly = TRUE)) { result <- pracma::hurstexp(x, display = FALSE) return(list(H = result$Hs, interpretation = ifelse(result$Hs > 0.55, "persistent", - ifelse(result$Hs < 0.45, "anti-persistent", "random")))) + ifelse(result$Hs < 0.45, "anti-persistent", "random") + ))) } .morie_py_call("hurst", x) } @@ -220,24 +221,24 @@ hfd <- function(x, kmax = 10L) { #' \donttest{ #' if (requireNamespace("signal", quietly = TRUE)) { #' set.seed(1) -#' x <- rnorm(2000) # 1 second of white-noise PCG-like input -#' y <- pcg_filter(x) +#' x <- rnorm(2000) # 1 second of white-noise PCG-like input +#' y <- morie_pcg_filter(x) #' length(y$filtered) #' } #' } -pcg_filter <- function(x, fs = 2000, low = 25, high = 400) { +morie_pcg_filter <- function(x, fs = 2000, low = 25, high = 400) { buttbp(x, fs, low, high) } .morie_py_call <- function(fn_name, ...) { args <- list(...) - arg_str <- paste(sapply(args, function(a) { + arg_str <- paste(vapply(args, function(a) { if (is.numeric(a) && length(a) > 1) { paste0("[", paste(a, collapse = ","), "]") } else { as.character(a) } - }), collapse = " ") + }, character(1)), collapse = " ") cmd <- paste(fn_name, arg_str) out <- system2("python3", c("-m", "morie.stat_bridge", "exec", cmd), stdout = TRUE, stderr = TRUE) paste(out, collapse = "\n") diff --git a/r-package/morie/R/siu.R b/r-package/morie/R/siu.R new file mode 100644 index 0000000000..ebc6dffb0d --- /dev/null +++ b/r-package/morie/R/siu.R @@ -0,0 +1,2211 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# +# siu.R -- orchestration for the all-C/C++ Ontario SIU parser. +# +# The HTTP transport (libcurl) and the 64-column HTML parsing both live +# in src/siu_parser.cpp. This file drives them: discover the published +# director's-report id range, concurrently fetch and parse every +# report page, fetch and parse the news releases they link, join the +# two, and write the 64-column SIU.csv. + +# Highest director's-report id to fetch. Discovered from the SIU site's +# incremental index endpoint (its newest row is the highest indexed id), +# plus a margin: a report finalised after the index was last built can +# sit at a drid just above the newest indexed one, so iterating a little +# past it guarantees those are captured. Ids with no report parse to +# blank rows that are dropped, so the margin is free. +# +# As of 2026-05 the live max sits around drid ~5100; the default margin +# of 300 gives substantial headroom for reports added between manifest +# refreshes, and the `default` fallback (used only when the discovery +# endpoint is unreachable) is set to 6000 so cold-start sweeps still +# capture everything currently published. +.siu_discover_max_drid <- function(default = 6000L, margin = 300L) { + html <- tryCatch( + .siu_http_get(paste0( + "https://www.siu.on.ca/ssi/get_more_drs.php", + "?lang=en&lastCount=0" + )), + error = function(e) "" + ) + hits <- regmatches(html, gregexpr('id="[0-9]+"', html))[[1L]] + ids <- suppressWarnings(as.integer(gsub("\\D", "", hits))) + ids <- ids[is.finite(ids)] + if (length(ids)) max(ids) + as.integer(margin) else as.integer(default) +} + +#' Fetch the Ontario SIU corpus into a 64-column SIU.csv +#' +#' Fetches and parses the Ontario Special Investigations Unit +#' (police-oversight) corpus -- every director's report and the news +#' releases they link -- into a single CSV with the canonical +#' 64-column schema, one row per case. +#' +#' The parser is implemented entirely in C/C++ (\code{src/siu_parser.cpp}): +#' libcurl drives the HTTP transport and a concurrent \code{curl_multi} +#' pool fetches the ~9,000+ pages, while the 64-field extraction is C++ +#' \code{std::regex} parsing. There is no Python dependency. +#' +#' This is the \emph{Ontario} Special Investigations Unit -- distinct +#' from the federal Structured Intervention Units and from OTIS. The +#' parsed corpus is not shipped with the package; each user runs the +#' parser themselves, which is fair use of public oversight reports. +#' +#' @param cache_dir Output directory. Defaults to a session-scoped +#' subdirectory of \code{\link[base]{tempdir}()} that R cleans up +#' automatically. For persistent cross-session caching pass +#' \code{cache_dir = morie_cache_dir("siu")} instead; see +#' \code{\link{morie_cache_dir}} and \code{\link{morie_cache_clear}}. +#' @param overwrite Logical; if \code{FALSE} and \code{SIU.csv} already +#' exists in \code{cache_dir}, its path is returned without reparsing. +#' @param max_drid Highest director's-report id to fetch. \code{NULL} +#' (default) uses the shipped manifest's max + a small margin, falling +#' back to discovery from the SIU site. +#' @param concurrency Maximum simultaneous HTTP transfers. Default +#' \code{4} is a polite rate paired with \code{rate_rps = 4}; raising +#' either above ~8/8 risks triggering WAF interstitials that return +#' short non-report HTML. +#' @param rate_rps Maximum request starts per second across the pool +#' (token-bucket throttle). Default \code{4} is the rate the package +#' was empirically validated against; lower it on poor connections +#' or contested endpoints. +#' @param use_manifest If \code{TRUE} (default), restrict the sweep to +#' the known-valid drids in the shipped manifest +#' (\code{inst/extdata/siu_drid_manifest.csv.gz}), still topping up +#' with any drid above the manifest's max up to \code{max_drid}. +#' Cuts the fetch by ~30-50 percent on a typical run by skipping holes. +#' @param lang Language filter on the manifest. \code{"all"} (default, +#' back-compat) fetches every known-valid drid -- English and +#' French copies of each case -- and then collapses to one row per +#' case_number with English winning the dedupe. \code{"en"} fetches +#' only the English drids (about half the size of the corpus and +#' half the network round trips); \code{"fr"} fetches only French. +#' Use \code{"en"} for the fastest cold-start when you only need +#' the canonical English text. +#' @param cache_html If \code{TRUE}, gzip and save the raw HTML of +#' every fetched director's-report and news-release page under +#' \code{/html/drid_NNNN.html.gz} and +#' \code{/html/nrid_NNNN.html.gz}. This is the persistent +#' ground truth for every row in the emitted CSV: any later +#' discrepancy between the parser and a human coder can be +#' adjudicated against the saved HTML without re-hitting SIU. Adds +#' ~80-100 MB to \code{cache_dir} for a full run; default +#' \code{FALSE} (the harvester remains lean unless you ask). +#' @param progress Logical; print progress messages. +#' @return Path to the written \code{SIU.csv}. +#' @examples +#' \dontrun{ +#' # Network: parses the full Ontario SIU corpus (~15-25 min at the +#' # default polite rate of 4 RPS). +#' csv <- morie_fetch_siu(cache_dir = tempdir()) +#' siu <- utils::read.csv(csv) +#' nrow(siu) +#' } +#' @export +morie_fetch_siu <- function(cache_dir = file.path(tempdir(), "morie", "siu"), + overwrite = FALSE, max_drid = NULL, + concurrency = 4L, rate_rps = 4.0, + use_manifest = TRUE, + lang = c("all", "en", "fr"), + cache_html = FALSE, + progress = TRUE) { + lang <- match.arg(lang) + cache_dir <- path.expand(cache_dir) + dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) + html_dir <- file.path(cache_dir, "html") + if (cache_html) { + dir.create(html_dir, + recursive = TRUE, + showWarnings = FALSE + ) + } + out_path <- file.path(cache_dir, "SIU.csv") + if (file.exists(out_path) && !overwrite) { + return(out_path) + } + + manifest <- if (use_manifest) .siu_load_manifest() else NULL + + # Always probe past the *live* max so new reports added since the + # manifest snapshot are captured. The manifest is a *floor* on the + # known-valid id space, not a ceiling on what we sweep. + if (is.null(max_drid)) { + live_max <- .siu_discover_max_drid() + manifest_max <- if (!is.null(manifest)) { + max(manifest$drid, na.rm = TRUE) + } else { + 0L + } + max_drid <- max(live_max, manifest_max + 300L) + } + max_drid <- as.integer(max_drid) + base_r <- "https://www.siu.on.ca/en/directors_report_details.php?drid=" + base_n <- "https://www.siu.on.ca/en/news_template.php?nrid=" + + # -- 1. fetch and parse every director's-report page -- + if (!is.null(manifest)) { + # Known-valid drids from the manifest, optionally filtered by + # language to save half the fetch when the user only needs EN + # (or only FR). The manifest's _language column was populated + # at refresh time by the parser's own language detector. + in_lang <- if (lang == "all") { + rep(TRUE, nrow(manifest)) + } else { + manifest[["_language"]] %in% c(lang, "unknown") + } + manifest_max <- max(manifest$drid[in_lang], na.rm = TRUE) + drids_below <- manifest$drid[in_lang & manifest$drid <= max_drid] + # Above the manifest's max we don't yet know each new drid's + # language; probe all of them and let parser drop non-matching + # rows at dedupe time. + drids_above <- if (max_drid > manifest_max) { + seq.int(manifest_max + 1L, max_drid) + } else { + integer(0) + } + drids <- sort(unique(c(drids_below, drids_above))) + } else { + drids <- seq_len(max_drid) + } + if (progress) { + message( + "SIU: fetching ", length(drids), " report pages + ", + "news releases (concurrency=", concurrency, + ", rate=", rate_rps, " req/s, interleaved batches) ..." + ) + } + + # -- 1+2. Interleaved fetch: each batch fires this batch's reports + # in the SAME rate-limited pool as the PREVIOUS batch's news + # releases. While the next 250 reports are downloading, the news + # pages whose nrids we just parsed are downloading alongside. + # Roughly halves wall time vs the prior two-phase serial flow, + # without changing the per-second rate the SIU site sees. + rep_html_all <- character(length(drids)) + news_html_by_nrid <- list() + pending_nrids <- character(0) + rep_rows_acc <- list() + batch_size <- 250L + batches <- split( + seq_along(drids), + ceiling(seq_along(drids) / batch_size) + ) + + for (bi in seq_along(batches)) { + idx <- batches[[bi]] + rep_urls <- paste0(base_r, drids[idx]) + news_urls <- if (length(pending_nrids)) { + paste0(base_n, pending_nrids) + } else { + character(0) + } + combined <- c(rep_urls, news_urls) + if (progress) { + message(sprintf( + "SIU batch %d/%d: %d reports + %d news ...", + bi, length(batches), + length(rep_urls), length(news_urls) + )) + } + results <- .siu_http_get_many( + combined, + as.integer(concurrency), + 60L, + as.numeric(rate_rps), + 3L + ) + rep_results <- results[seq_along(rep_urls)] + news_results <- if (length(news_urls)) { + results[(length(rep_urls) + 1L):length(results)] + } else { + character(0) + } + + # Persist HTML if requested + if (cache_html) { + for (k in seq_along(idx)) { + if (nzchar(rep_results[k])) { + .siu_write_html_cache( + html_dir, + sprintf("drid_%d.html.gz", drids[idx][k]), + rep_results[k] + ) + } + } + if (length(pending_nrids)) { + for (k in seq_along(pending_nrids)) { + if (nzchar(news_results[k])) { + .siu_write_html_cache( + html_dir, + sprintf( + "nrid_%s.html.gz", + pending_nrids[k] + ), + news_results[k] + ) + } + } + } + } + + # Stow this batch's HTML + rep_html_all[idx] <- rep_results + if (length(pending_nrids)) { + for (k in seq_along(pending_nrids)) { + news_html_by_nrid[[pending_nrids[k]]] <- news_results[k] + } + } + + # Parse this batch's reports immediately so we can queue the + # next batch's nrids for fetching alongside the next reports. + batch_rep_rows <- lapply(seq_along(idx), function(k) { + .siu_parse_report( + rep_results[k], drids[idx][k], + paste0(base_r, drids[idx][k]) + ) + }) + rep_rows_acc[idx] <- batch_rep_rows + + # Extract new nrids from this batch's reports + batch_nrids <- vapply( + batch_rep_rows, + function(r) as.character(r[["nrid"]]), + character(1) + ) + batch_nrids <- batch_nrids[nzchar(batch_nrids)] + # Only fetch each nrid once across the whole run + pending_nrids <- setdiff( + unique(batch_nrids), + names(news_html_by_nrid) + ) + } + + # Final cleanup: fetch the last batch's nrids that haven't been + # paired with a next-batch's report fetch yet. + if (length(pending_nrids)) { + if (progress) { + message(sprintf( + "SIU final: %d remaining news pages ...", + length(pending_nrids) + )) + } + final_news <- .siu_http_get_many( + paste0(base_n, pending_nrids), + as.integer(concurrency), + 60L, + as.numeric(rate_rps), + 3L + ) + if (cache_html) { + for (k in seq_along(pending_nrids)) { + if (nzchar(final_news[k])) { + .siu_write_html_cache( + html_dir, + sprintf( + "nrid_%s.html.gz", + pending_nrids[k] + ), + final_news[k] + ) + } + } + } + for (k in seq_along(pending_nrids)) { + news_html_by_nrid[[pending_nrids[k]]] <- final_news[k] + } + } + + # Build the report data frame from the accumulated rows. + df <- as.data.frame( + do.call(rbind, lapply(rep_rows_acc, as.character)), + stringsAsFactors = FALSE + ) + names(df) <- names(rep_rows_acc[[1L]]) + + # -- 3. Parse the news-release HTML we collected during the interleave -- + if (length(news_html_by_nrid)) { + parsed_nrids <- names(news_html_by_nrid) + news_rows <- lapply(seq_along(parsed_nrids), function(i) { + .siu_parse_news( + news_html_by_nrid[[parsed_nrids[i]]], + as.integer(parsed_nrids[i]), + paste0(base_n, parsed_nrids[i]) + ) + }) + news_df <- as.data.frame( + do.call(rbind, lapply(news_rows, as.character)), + stringsAsFactors = FALSE + ) + names(news_df) <- names(news_rows[[1L]]) + + # -- 4. join the news fields onto the report rows by nrid -- + j <- match(df$nrid, news_df$nrid) + hit <- !is.na(j) + for (col in c( + "source_url_news", "news_release_title", + "news_release_date_iso", "news_release_date_raw", + "news_release_summary" + )) { + df[[col]][hit] <- news_df[[col]][j[hit]] + } + } + + # One row per case. Drop pages with no case number (non-existent or + # draft drids), then collapse the English and French copies of a case + # to a single row -- keeping the English one and its drid / nrid, so + # every emitted row is one case identified by one case_number. + df <- df[nzchar(df$case_number), , drop = FALSE] + if (nrow(df) > 0L) { + df <- df[order(df$case_number, df[["_language"]] != "en"), , + drop = FALSE + ] + df <- df[!duplicated(df$case_number), , drop = FALSE] + rownames(df) <- NULL + } + + # SIU pages (notably the French releases) are UTF-8. + for (col in names(df)) Encoding(df[[col]]) <- "UTF-8" + + # Apply canonical overrides last (so they win over any regex + # extraction). Merges the shipped maintainer-verified table with + # the user-side cache_dir/canonical_overrides.csv if present. + # This is how the parser "learns": every verified correction we + # record becomes part of the output on the next run, no C++ build + # needed. + overrides <- .siu_load_canonical_overrides(user_cache_dir = cache_dir) + if (!is.null(overrides)) { + n_before_overrides <- sum(vapply( + seq_len(nrow(overrides)), + function(i) { + case <- overrides$case_number[i] + fld <- overrides$field[i] + if (!fld %in% names(df)) { + return(0L) + } + idx <- which(df$case_number == case) + if (length(idx) == 1L && + df[[fld]][idx] != overrides$verified_value[i]) { + 1L + } else { + 0L + } + }, integer(1) + )) + df <- .siu_apply_canonical_overrides(df, overrides) + if (progress && n_before_overrides > 0L) { + message( + "SIU: applied ", n_before_overrides, + " canonical-override correction(s) ", + "(", nrow(overrides), " total in override table)" + ) + } + } + + utils::write.csv(df, out_path, row.names = FALSE, na = "") + if (progress) { + message( + "SIU: wrote ", nrow(df), " rows (", + sum(nzchar(df$case_number)), " with a case number) to ", + out_path + ) + } + out_path +} + +#' SIU drid → case_number → language index +#' +#' Returns the shipped drid manifest as a data frame -- one row per +#' director's-report id morie has verified, with the parsed case +#' number, detected language, and the canonical drid (the English +#' drid for that case, or the first drid if no English version +#' exists). This is the index \code{morie_fetch_siu()} uses +#' internally; exposing it lets users: +#' +#' \itemize{ +#' \item see exactly which drids ship as known-valid (no need +#' to fetch to find out); +#' \item subset to English-only or French-only case lists +#' without running the full harvester; +#' \item map between drid (URL fragment) and case_number (SIU's +#' own identifier) offline. +#' } +#' +#' The manifest is refreshed by maintainers via +#' \code{morie_siu_refresh_manifest()}; it ships gzipped under +#' \code{inst/extdata/} at ~50 KB. +#' +#' @param lang Filter rows by detected language. \code{"all"} +#' (default) returns every entry; \code{"en"} returns only the +#' English drids; \code{"fr"} returns only French; \code{"valid"} +#' returns every drid whose case_number was successfully parsed +#' (drops blank / draft drids). +#' @param canonical_only If \code{TRUE}, returns one row per +#' case_number (the canonical drid for that case, English +#' preferred). Useful when you want a unique-cases index. +#' @return A data frame with columns \code{drid}, \code{http_code}, +#' \code{body_bytes}, \code{attempts}, \code{case_number}, +#' \code{_language}, \code{source}, \code{retrieved_at_utc}, +#' \code{canonical_drid}. +#' @examples +#' idx <- morie_siu_index(lang = "en") +#' head(idx) +#' # How many drids are English vs French vs unknown? +#' table(morie_siu_index()$`_language`) +#' # Unique-case index (English-preferred) +#' canon <- morie_siu_index(canonical_only = TRUE) +#' nrow(canon) +#' @export +morie_siu_index <- function(lang = c("all", "en", "fr", "valid"), + canonical_only = FALSE) { + lang <- match.arg(lang) + m <- .siu_load_manifest_raw() + if (is.null(m)) { + stop("No shipped SIU drid manifest available. The package ", + "build is missing inst/extdata/siu_drid_manifest.csv.gz.", + call. = FALSE + ) + } + if (lang == "valid") { + m <- m[nzchar(m$case_number), , drop = FALSE] + } else if (lang %in% c("en", "fr")) { + m <- m[m[["_language"]] == lang, , drop = FALSE] + } + if (canonical_only && "canonical_drid" %in% names(m)) { + m <- m[!is.na(m$canonical_drid) & m$drid == m$canonical_drid, , + drop = FALSE + ] + } + rownames(m) <- NULL + m +} + +# Internal: read the unfiltered shipped manifest. Returns NULL on +# any failure. Unlike .siu_load_manifest() (which restricts to +# healthy 200s for harvester use), this returns ALL columns + rows +# so morie_siu_index() can serve the full table. +.siu_load_manifest_raw <- function() { + p <- system.file("extdata", "siu_drid_manifest.csv.gz", + package = "morie" + ) + if (!nzchar(p) || !file.exists(p)) { + return(NULL) + } + tryCatch( + utils::read.csv(gzfile(p), + colClasses = "character", + check.names = FALSE + ), + error = function(e) NULL + ) +} + +# Internal: read the shipped canonical override table if present. +# This is a (case_number, field, verified_value) long table -- one row +# per cell we've verified against the report HTML, by maintainer +# review or LLM consensus. Applied by morie_fetch_siu() AFTER the +# regex extraction, so the parser "learns": every correction we +# commit propagates to all users on the next package update without +# any C++ rebuild. +# +# The user-side cache at /canonical_overrides.csv mirrors +# the shipped one and is merged in too -- so individual users can +# record their own corrections without touching the package source. +# Maintainer-confirmed corrections get promoted into the shipped +# table; user-side corrections stay local until then. +.siu_load_canonical_overrides <- function(user_cache_dir = NULL) { + read_one <- function(p) { + if (!nzchar(p) || !file.exists(p)) { + return(NULL) + } + df <- tryCatch( + if (endsWith(p, ".gz")) { + utils::read.csv(gzfile(p), + colClasses = "character", + check.names = FALSE + ) + } else { + utils::read.csv(p, colClasses = "character", check.names = FALSE) + }, + error = function(e) NULL + ) + if (is.null(df) || !nrow(df)) { + return(NULL) + } + if (!all(c("case_number", "field", "verified_value") %in% names(df))) { + return(NULL) + } + df + } + shipped <- read_one(system.file("extdata", + "siu_canonical_overrides.csv.gz", + package = "morie" + )) + user <- if (!is.null(user_cache_dir)) { + read_one(file.path( + path.expand(user_cache_dir), + "canonical_overrides.csv" + )) + } else { + NULL + } + if (is.null(shipped) && is.null(user)) { + return(NULL) + } + out <- rbind( + shipped[, c("case_number", "field", "verified_value"), drop = FALSE], + user[, c("case_number", "field", "verified_value"), drop = FALSE] + ) + # User overrides win on conflict: rev() so most-recent insertions + # land at the top of unique(). + out <- out[!duplicated(rev(out)[, c("case_number", "field")]), , + drop = FALSE + ] + out +} + +# Internal: apply a canonical-overrides table to a parsed SIU data +# frame. Each row of `overrides` is (case_number, field, verified_value); +# for any match, overwrite df[[field]] at the row whose case_number +# matches. Silent on misses (override for a case not in the parse, +# or field not in the schema). +.siu_apply_canonical_overrides <- function(df, overrides) { + if (is.null(overrides) || !nrow(overrides)) { + return(df) + } + for (i in seq_len(nrow(overrides))) { + case <- overrides$case_number[i] + fld <- overrides$field[i] + val <- overrides$verified_value[i] + if (!fld %in% names(df)) next + row_idx <- which(df$case_number == case) + if (length(row_idx) == 1L) { + df[[fld]][row_idx] <- val + } + } + df +} + +#' Record a verified correction to the SIU parser's output +#' +#' Saves a (case_number, field, verified_value) tuple to a local +#' overrides CSV at \code{/canonical_overrides.csv}. Every +#' subsequent \code{morie_fetch_siu()} on that \code{cache_dir} will +#' overlay these corrections onto the regex-parsed output. The shipped +#' \code{inst/extdata/siu_canonical_overrides.csv.gz} carries +#' maintainer-confirmed corrections; this function lets users add +#' their own without touching the package source. +#' +#' This is the "memory" of the parser: every wrong cell you find and +#' fix becomes permanent for that cache_dir. Maintainers can submit +#' corrections upstream by sharing the resulting CSV file. +#' +#' @param case_number SIU case number, e.g. \code{"17-OVI-201"}. +#' @param field Name of the column in the SIU schema (e.g. +#' \code{"location_of_call"}). +#' @param verified_value The correct value, verified against the +#' cached HTML (see \code{morie_siu_audit_case()}). +#' @param note Optional one-line note describing the basis for the +#' correction (HTML excerpt, LLM verdict, etc.). +#' @param cache_dir Directory holding the harvester's SIU.csv. +#' @return Invisibly, the path to the updated overrides CSV. +#' @examples +#' \donttest{ +#' # Writes the correction to a temp cache so the example never +#' # touches the per-user cache directory. +#' tmp <- tempfile("morie_siu_"); dir.create(tmp, recursive = TRUE) +#' morie_siu_record_correction( +#' case_number = "17-OVI-201", +#' field = "location_of_call", +#' verified_value = "Clair Road East, City of Guelph", +#' note = "HTML excerpt: 'on Clair Road East in the City of Guelph'", +#' cache_dir = tmp +#' ) +#' unlink(tmp, recursive = TRUE) +#' } +#' @export +morie_siu_record_correction <- function(case_number, field, + verified_value, note = "", + cache_dir = file.path(tempdir(), "morie", "siu")) { + stopifnot( + is.character(case_number), length(case_number) == 1L, + is.character(field), length(field) == 1L, + is.character(verified_value), length(verified_value) == 1L, + field %in% .siu_field_list() + ) + cache_dir <- path.expand(cache_dir) + dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) + path <- file.path(cache_dir, "canonical_overrides.csv") + existing <- if (file.exists(path)) { + utils::read.csv(path, colClasses = "character", check.names = FALSE) + } else { + data.frame( + case_number = character(), field = character(), + verified_value = character(), note = character(), + recorded_at_utc = character(), + stringsAsFactors = FALSE + ) + } + # De-dupe: last write wins for any (case, field) pair. + existing <- existing[!(existing$case_number == case_number & + existing$field == field), , drop = FALSE] + new_row <- data.frame( + case_number = case_number, + field = field, + verified_value = verified_value, + note = note, + recorded_at_utc = format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ", tz = "UTC"), + stringsAsFactors = FALSE + ) + out <- rbind(existing, new_row) + utils::write.csv(out, path, row.names = FALSE) + invisible(path) +} + +# Internal: read the shipped DRID manifest if present. Returns NULL on +# any failure so the harvester degrades gracefully to a full sweep. +.siu_load_manifest <- function() { + p <- system.file("extdata", "siu_drid_manifest.csv.gz", package = "morie") + if (!nzchar(p) || !file.exists(p)) { + return(NULL) + } + m <- tryCatch( + utils::read.csv(gzfile(p), colClasses = c( + drid = "integer", http_code = "integer", body_bytes = "integer", + case_number = "character", source = "character", + retrieved_at_utc = "character" + )), + error = function(e) NULL + ) + if (is.null(m) || !nrow(m) || !"drid" %in% names(m)) { + return(NULL) + } + # Keep only confirmed-valid rows (200 OK + non-trivial body + parsed + # case_number). These are the drids worth re-fetching on a run; the + # rest are deliberately skipped to save bandwidth. + m <- m[is.finite(m$drid) & m$http_code == 200L & + m$body_bytes >= 1000L & nzchar(m$case_number), , drop = FALSE] + if (!nrow(m)) { + return(NULL) + } + m +} + +#' Rebuild the Ontario SIU DRID manifest by probing the live site +#' +#' Sweeps director's-report ids \code{1..max_drid} and writes a small +#' CSV recording which ids return a healthy report page, the parsed +#' case number, and the response body size. The harvester +#' (\code{morie_fetch_siu}) then uses this manifest to short-circuit +#' the ~30-50 percent of ids that have no report, saving bandwidth and +#' WAF-trigger risk on every run. +#' +#' The shipped manifest at \code{inst/extdata/siu_drid_manifest.csv.gz} +#' is a snapshot. Users who want the latest can call this function; +#' it is also how morie maintainers regenerate the snapshot. +#' +#' @param out_path Path to write the gzipped CSV. Default is the +#' in-place manifest location (only useful for maintainers building +#' from a source checkout). +#' @param max_drid Highest drid to probe. Default \code{NULL} +#' auto-discovers from the SIU index endpoint and adds a margin. +#' @param min_drid Lowest drid to probe (default \code{1L}). +#' @param concurrency Maximum simultaneous transfers (default \code{4}). +#' @param rate_rps Maximum request starts per second (default \code{4}). +#' @param progress Logical; print a per-batch progress line. +#' @return Invisibly, a data frame of the full sweep (every probed drid, +#' including misses), parallel to what was written to \code{out_path}. +#' @examples +#' \dontrun{ +#' # Network: refreshes the manifest by probing the SIU site +#' # (~25-40 min at the default polite rate of 4 RPS for ~6000 ids). +#' df <- morie_siu_refresh_manifest(out_path = tempfile(fileext = ".csv.gz")) +#' table(df$http_code) +#' } +#' @export +morie_siu_refresh_manifest <- function( + out_path = NULL, max_drid = NULL, min_drid = 1L, + concurrency = 4L, rate_rps = 4.0, progress = TRUE +) { + # Manifest refresh sweeps a generous range so the resulting snapshot + # stays useful for several months without re-probing. Default is + # max(live-discovery + margin, 6000) — the live max currently sits + # around drid ~5100, and 6000 gives headroom for ~one year of new + # reports at the SIU's historical publish cadence. + if (is.null(max_drid)) max_drid <- max(.siu_discover_max_drid(), 6000L) + min_drid <- as.integer(min_drid) + max_drid <- as.integer(max_drid) + stopifnot(min_drid >= 1L, max_drid >= min_drid) + + base_r <- "https://www.siu.on.ca/en/directors_report_details.php?drid=" + drids <- seq.int(min_drid, max_drid) + if (progress) { + message( + "SIU manifest: probing ", length(drids), " drids ", + "[", min_drid, "..", max_drid, "] at ", + rate_rps, " req/s ..." + ) + } + res <- .siu_http_get_many_with_status( + paste0(base_r, drids), + as.integer(concurrency), + 60L, + as.numeric(rate_rps), + 3L + ) + + # Parse case_number from each body so the manifest can ship the + # canonical id mapping. Empty body => empty case_number. + case_no <- vapply(res$body, function(html) { + if (!nzchar(html) || nchar(html) < 1000L) { + return("") + } + parsed <- tryCatch(.siu_parse_report(html, 0L, ""), + error = function(e) NULL + ) + if (is.null(parsed)) "" else as.character(parsed[["case_number"]]) + }, character(1), USE.NAMES = FALSE) + + df <- data.frame( + drid = drids, + http_code = as.integer(res$http_code), + body_bytes = as.integer(nchar(res$body)), + attempts = as.integer(res$attempts), + case_number = case_no, + source = "siu.on.ca", + retrieved_at_utc = format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ", + tz = "UTC" + ), + stringsAsFactors = FALSE + ) + + if (!is.null(out_path)) { + gz <- gzfile(out_path, "w") + utils::write.csv(df, gz, row.names = FALSE) + close(gz) + if (progress) { + ok <- sum(df$http_code == 200L & df$body_bytes >= 1000L) + message( + "SIU manifest: wrote ", nrow(df), " rows (", + ok, " healthy 200s) to ", out_path + ) + } + } + invisible(df) +} + +# Internal: write one HTML page to /, gzipped. Called +# from morie_fetch_siu() when cache_html = TRUE. +.siu_write_html_cache <- function(html_dir, name, html) { + con <- gzfile(file.path(html_dir, name), "w") + on.exit(close(con), add = TRUE) + writeChar(html, con, eos = NULL) +} + +# Internal: read a gzipped cached HTML page if it exists, else "". +.siu_read_html_cache <- function(html_dir, name) { + p <- file.path(html_dir, name) + if (!file.exists(p)) { + return("") + } + con <- gzfile(p, "rb") + on.exit(close(con), add = TRUE) + bytes <- readBin(con, "raw", n = file.info(p)$size * 50L) + rawToChar(bytes) +} + +#' Audit one SIU case end-to-end: parser output + raw HTML +#' +#' For any case_number (or drid), return the parser's 64-column row +#' together with the raw HTML pages it was extracted from -- the +#' director's-report page and, when linked, the news-release page. +#' This is the per-row ground truth: every field in the emitted CSV +#' is reproducible from \code{report_html} via the parser, and any +#' disagreement with another data source can be adjudicated against +#' the saved HTML. +#' +#' Reads from the local cache at \code{/html/} (populated +#' by \code{morie_fetch_siu(cache_html = TRUE)}) when available, and +#' falls back to a polite live fetch when the cache is missing. +#' +#' @param case_number An SIU case number (e.g. \code{"17-OVI-201"}), +#' or an integer drid. +#' @param cache_dir Directory holding the harvester's SIU.csv and the +#' optional \code{html/} subdirectory. +#' @param fetch_if_missing If \code{TRUE} (default), fetch the page +#' from SIU when the local cache misses. Set \code{FALSE} to work +#' strictly from the cache. +#' @return A list with elements \code{row} (the parser's 1-row data +#' frame for this case), \code{drid}, \code{nrid}, +#' \code{report_html}, \code{news_html}, \code{report_text} +#' (HTML-stripped plain text of the report) and \code{news_text}. +#' @examples +#' \dontrun{ +#' a <- morie_siu_audit_case( +#' "17-OVI-201", +#' cache_dir = file.path(tempdir(), "morie", "siu") +#' ) +#' cat(substr(a$report_text, 1, 1000), "\n") +#' } +#' @export +morie_siu_audit_case <- function(case_number, + cache_dir = file.path(tempdir(), "morie", "siu"), + fetch_if_missing = TRUE) { + cache_dir <- path.expand(cache_dir) + html_dir <- file.path(cache_dir, "html") + csv_path <- file.path(cache_dir, "SIU.csv") + if (!file.exists(csv_path)) { + stop("No SIU.csv at '", csv_path, "'; run morie_fetch_siu() first.", + call. = FALSE + ) + } + df <- utils::read.csv(csv_path, + colClasses = "character", + check.names = FALSE + ) + # Caller may pass a case_number string OR a numeric drid. + if (is.numeric(case_number)) { + drid <- as.integer(case_number) + row <- df[df$drid == as.character(drid), , drop = FALSE] + } else { + row <- df[df$case_number == case_number, , drop = FALSE] + } + if (!nrow(row)) { + stop("No row found for '", case_number, "' in ", csv_path, + call. = FALSE + ) + } + drid <- as.integer(row$drid[1L]) + nrid <- suppressWarnings(as.integer(row$nrid[1L])) + + fetch_one <- function(url) { + tryCatch(.siu_http_get(url), error = function(e) "") + } + + report_html <- .siu_read_html_cache( + html_dir, + sprintf("drid_%d.html.gz", drid) + ) + if (!nzchar(report_html) && fetch_if_missing) { + report_html <- fetch_one(paste0( + "https://www.siu.on.ca/en/directors_report_details.php?drid=", + drid + )) + } + + news_html <- "" + if (!is.na(nrid)) { + news_html <- .siu_read_html_cache( + html_dir, + sprintf("nrid_%d.html.gz", nrid) + ) + if (!nzchar(news_html) && fetch_if_missing) { + news_html <- fetch_one(paste0( + "https://www.siu.on.ca/en/news_template.php?nrid=", nrid + )) + } + } + + list( + row = row, + drid = drid, + nrid = nrid, + report_html = report_html, + news_html = news_html, + report_text = .siu_html_to_text(report_html), + news_text = .siu_html_to_text(news_html) + ) +} + +# Internal: R-side HTML-to-text helper. Strips tags + decodes the +# most common entities so reports + news releases can be displayed +# as plain text. Mirrors the C++ html_to_text() but with the safer +# linear single-pass approach (no std::regex backtracking risk). +.siu_html_to_text <- function(h) { + if (!nzchar(h)) { + return("") + } + # Drop and chunks first. + h <- gsub("(?is)]*>.*?", " ", h, perl = TRUE) + h <- gsub("(?is)]*>.*?", " ", h, perl = TRUE) + h <- gsub("<[^>]+>", " ", h, perl = TRUE) + # Common named entities (small set covering ~99% of SIU pages). + ents <- c( + "&" = "&", "<" = "<", ">" = ">", """ = "\"", + "'" = "'", "'" = "'", " " = " ", + "’" = "'", "‘" = "'", "“" = "\"", + "”" = "\"", "–" = "-", "—" = "-", + "…" = "..." + ) + for (k in names(ents)) h <- gsub(k, ents[[k]], h, fixed = TRUE) + # Numeric entities (decimal + hex). + h <- gsub("&#([0-9]+);", "\\1", + h, + perl = TRUE + ) # leaves digits; cheap fallback + h <- gsub("\\s+", " ", h, perl = TRUE) + trimws(h) +} + +#' Field-by-field SIU comparison against a user-supplied external table +#' +#' For one case_number, line up the parser's value against the same +#' field in a user-supplied external data source -- and, critically, +#' show the surrounding report HTML so the user can adjudicate any +#' disagreement against the actual source document. +#' +#' \strong{The ground truth is the SIU director's-report HTML +#' itself.} The HTML is what the SIU published; the parser's job is +#' to extract structured fields from it faithfully, and any field's +#' correctness is decidable by reading the cached HTML for that +#' case. Any external reference -- a hand-coded survey, an +#' independently-scraped CSV, a colleague's analysis -- is just +#' another extraction attempt, possibly with its own errors. This +#' function does not endorse any external source; it only displays +#' both side-by-side with the HTML excerpt so you can decide. +#' +#' The default field map covers the common SIU-extraction column +#' layout (\code{Q1 = case_number}, \code{Q3 = police_service}, +#' \code{Q4 = number_of_officers_involved}, ...). Pass a custom +#' \code{field_map} for any other external schema. +#' +#' @param case_number A case number (e.g. \code{"17-OVI-201"}). +#' @param external A data frame of external answers, OR a path to an +#' \code{.xlsx} file (read with \code{readxl}). Must contain a +#' column whose values match SIU case numbers (default +#' \code{external_case_col = "Q1"}). +#' @param field_map A named list mapping external-column names to +#' morie field names. +#' @param external_case_col Name of the external column carrying the +#' case-number key. +#' @param cache_dir Directory holding the harvester's SIU.csv and +#' optional cached HTML. +#' @return A data frame with one row per mapped field: \code{field}, +#' \code{parser_value}, \code{external_value}, \code{agree}, and +#' \code{html_excerpt} (a 240-character window around the first +#' occurrence of either value in the cleaned report text). When +#' parser and external disagree, the \code{html_excerpt} is the +#' tie-breaker. +#' @examples +#' \dontrun{ +#' # Caller supplies their own external table; nothing about the +#' # mapping or the file format is canonical to morie. +#' external <- data.frame(case_id = "17-OVI-201", officers = 1L) +#' cmp <- morie_siu_compare( +#' "17-OVI-201", +#' external = external, +#' field_map = list(officers = "number_of_officers_involved"), +#' external_case_col = "case_id" +#' ) +#' subset(cmp, !agree) +#' } +#' @export +morie_siu_compare <- function(case_number, external, + field_map = NULL, + external_case_col = "Q1", + cache_dir = file.path(tempdir(), "morie", "siu")) { + if (is.null(field_map)) { + # Convenience default for the most common SIU-extraction column + # naming. Override with your own field_map for any other schema. + # No endorsement of any particular external source is intended. + field_map <- list( + Q1 = "case_number", + Q3 = "police_service", + Q4 = "number_of_officers_involved", + Q5 = "location_of_call", + Q9 = "number_of_affected_persons", + Q10 = "sex_gender_affected", + Q11 = "age_affected", + Q14 = "number_of_civilian_witnesses", + Q16 = "number_of_subject_officials", + Q19 = "number_of_witness_officials", + Q26 = "charges_recommended" + ) + } + + if (is.character(external) && length(external) == 1L && + file.exists(external)) { + if (!requireNamespace("readxl", quietly = TRUE)) { + stop("Install 'readxl' to read .xlsx, or pass a data frame ", + "instead.", + call. = FALSE + ) + } + external <- as.data.frame(readxl::read_excel(external)) + # Many survey exports prefix data with a row of question prose; + # drop it if the first cell doesn't look like an SIU case number. + if (nrow(external) >= 1L && + !grepl( + "^[0-9]{2}-[A-Z]{3,4}-[0-9]{3}", + as.character(external[[external_case_col]][1L]) + )) { + external <- external[-1L, , drop = FALSE] + } + } + if (!external_case_col %in% names(external)) { + stop("External has no column '", external_case_col, "'.", + call. = FALSE + ) + } + + audit <- morie_siu_audit_case(case_number, cache_dir = cache_dir) + e_row <- external[as.character(external[[external_case_col]]) == + case_number, , drop = FALSE] + if (!nrow(e_row)) { + stop("External has no row for '", case_number, "'.", call. = FALSE) + } + + text <- audit$report_text + excerpt_for <- function(needle) { + if (!nzchar(needle) || !nzchar(text)) { + return("") + } + p <- regexpr(needle, text, fixed = TRUE) + if (p < 0L) { + return("") + } + start <- max(1L, p - 80L) + end <- min(nchar(text), p + nchar(needle) + 80L) + substr(text, start, end) + } + norm <- function(x) { + x <- trimws(as.character(x)) + x <- gsub("\\.0+$", "", x) + tolower(x) + } + + rows <- lapply(names(field_map), function(k) { + fld <- field_map[[k]] + if (!(k %in% names(e_row)) || !(fld %in% names(audit$row))) { + return(NULL) + } + pv <- as.character(audit$row[[fld]][1L]) + ev <- as.character(e_row[[k]][1L]) + data.frame( + field = fld, + parser_value = pv, + external_value = ev, + agree = norm(pv) == norm(ev), + html_excerpt = if (nzchar(pv)) { + excerpt_for(pv) + } else { + excerpt_for(ev) + }, + stringsAsFactors = FALSE + ) + }) + out <- do.call(rbind, rows) + rownames(out) <- NULL + out +} + +# =========================================================================== +# LLM-assisted SIU extraction. Optional helpers that send a cached SIU +# director's-report HTML page through a large-language-model endpoint +# and return the same 64-column row format (for morie_siu_llm_extract) +# or per-field "does this extraction match the report?" verdicts (for +# morie_siu_anomaly_check). The cached HTML remains the ground truth; +# the LLM output is just another extraction attempt that can be +# diffed against the C++ parser via morie_siu_compare(). +# +# Providers are configured via env vars so secrets never appear in +# the package or in chat: +# GOOGLE_API_KEY -> Gemini (default; cheapest) +# ANTHROPIC_API_KEY -> Claude +# Both functions hard-fail with a clear message if the relevant env +# var is missing. +# =========================================================================== + +# Internal: minimal provider table. Each entry's `build()` returns a +# fully-resolved request spec (url, headers, body); the dispatcher +# below sends it via httr2 and the `extract()` pulls the model's +# text out of the parsed response. Hand-written rather than pulling +# a heavy LLM client library so morie's dep surface stays tiny +# (httr2 + jsonlite, both in Suggests). +# +# Providers: +# gemini -- closed, paid, fast. env: GOOGLE_API_KEY +# claude -- closed, paid, fast. env: ANTHROPIC_API_KEY +# ollama -- open-weight models env: OLLAMA_HOST (e.g. +# over a local or self- "http://localhost:11434" +# hosted REST endpoint; or any hosted Ollama- +# free/OllamaFreeAPI- compatible base URL), +# compatible. optional OLLAMA_MODEL +# (default "llama3.2:3b") +# Internal: default LLM HTTP timeout in seconds. 600s (10 min) +# accommodates slow CPU-only local inference on a Raspberry Pi. +# Override globally via MORIE_LLM_TIMEOUT_S env var. +.siu_llm_default_timeout <- function() { + v <- Sys.getenv("MORIE_LLM_TIMEOUT_S", unset = "") + t <- suppressWarnings(as.integer(v)) + if (!is.finite(t) || t < 1L) 600L else t +} + +.siu_llm_providers <- function() { + list( + gemini = list( + env_required = "GOOGLE_API_KEY", + build = function(env, prompt) { + list( + url = paste0( + "https://generativelanguage.googleapis.com/v1beta/", + "models/gemini-2.5-flash:generateContent?key=", + env[["GOOGLE_API_KEY"]] + ), + headers = list("content-type" = "application/json"), + body = list( + contents = list(list(parts = list(list(text = prompt)))), + generationConfig = list( + temperature = 0, + response_mime_type = "application/json" + ) + ) + ) + }, + extract = function(resp) { + x <- resp$candidates[[1L]]$content$parts[[1L]]$text + if (is.null(x)) stop("Gemini returned empty text", call. = FALSE) + x + } + ), + claude = list( + env_required = "ANTHROPIC_API_KEY", + build = function(env, prompt) { + list( + url = "https://api.anthropic.com/v1/messages", + headers = list( + "x-api-key" = env[["ANTHROPIC_API_KEY"]], + "anthropic-version" = "2023-06-01", + "content-type" = "application/json" + ), + body = list( + model = "claude-sonnet-4-6", + max_tokens = 8192L, + messages = list(list(role = "user", content = prompt)) + ) + ) + }, + extract = function(resp) { + x <- resp$content[[1L]]$text + if (is.null(x)) stop("Claude returned empty text", call. = FALSE) + x + } + ), + vertex = list( + # Google Cloud Vertex AI Gemini. Cheaper than AI Studio's + # consumer endpoint; routes through a GCP project for billing + # (e.g. against an existing GCP credit). The bearer token is + # whatever you have in VERTEX_ACCESS_TOKEN -- typically the + # output of `gcloud auth print-access-token` on a machine + # where gcloud is signed in to the relevant project. + # + # GCP_PROJECT defaults to "hadesllm" and GCP_LOCATION to + # "us-central1" if unset; override either via env. Model + # defaults to gemini-2.5-flash; override via VERTEX_MODEL. + env_required = "VERTEX_ACCESS_TOKEN", + build = function(env, prompt) { + project <- if (nzchar(Sys.getenv("GCP_PROJECT", ""))) { + Sys.getenv("GCP_PROJECT") + } else { + "hadesllm" + } + location <- if (nzchar(Sys.getenv("GCP_LOCATION", ""))) { + Sys.getenv("GCP_LOCATION") + } else { + "us-central1" + } + model <- if (nzchar(Sys.getenv("VERTEX_MODEL", ""))) { + Sys.getenv("VERTEX_MODEL") + } else { + "gemini-2.5-flash" + } + list( + url = sprintf( + paste0( + "https://%s-aiplatform.googleapis.com/v1/projects/%s", + "/locations/%s/publishers/google/models/%s", + ":generateContent" + ), + location, project, location, model + ), + headers = list( + "authorization" = + paste("Bearer", env[["VERTEX_ACCESS_TOKEN"]]), + "content-type" = "application/json" + ), + body = list( + contents = list(list( + role = "user", + parts = list(list(text = prompt)) + )), + generationConfig = list( + temperature = 0, + responseMimeType = "application/json" + ) + ) + ) + }, + extract = function(resp) { + x <- resp$candidates[[1L]]$content$parts[[1L]]$text + if (is.null(x)) stop("Vertex returned empty text", call. = FALSE) + x + } + ), + ollama = list( + # OLLAMA_HOST is the documented env var, but if unset we silently + # try a local daemon at http://localhost:11434 -- that's the + # zero-config path for a user who's just installed `ollama` and + # pulled `gemma3:4b` (or any Gemma / Functiongemma variant). No + # API key, no paid subscription, no signup needed. + # + # On CPU-only hardware (e.g. Raspberry Pi 5) the default + # gemma3:4b model is slow (~3 tok/sec); set OLLAMA_MODEL to + # gemma3:270m (~290 MB, ~50 tok/sec) for a 10x speedup with + # only modest quality loss. OLLAMA_KEEP_ALIVE keeps the loaded + # model resident across requests so we don't pay the 10s + # cold-start tax per case. + env_required = "OLLAMA_HOST_OR_DEFAULT", + build = function(env, prompt) { + host <- sub("/+$", "", env[["OLLAMA_HOST_OR_DEFAULT"]]) + headers <- list("content-type" = "application/json") + # Optional bearer token, for hosted Ollama-compatible APIs + # (e.g. OllamaFreeAPI gateways) that require auth. Local + # Ollama at localhost:11434 doesn't need it. + api_key <- Sys.getenv("OLLAMA_API_KEY", unset = "") + if (nzchar(api_key)) { + headers[["authorization"]] <- paste("Bearer", api_key) + } + list( + url = paste0(host, "/api/generate"), + headers = headers, + body = list( + model = if (nzchar(Sys.getenv("OLLAMA_MODEL", ""))) { + Sys.getenv("OLLAMA_MODEL") + } else { + "gemma3:4b" + }, + prompt = prompt, + format = "json", + stream = FALSE, + keep_alive = if (nzchar(Sys.getenv("OLLAMA_KEEP_ALIVE", ""))) { + Sys.getenv("OLLAMA_KEEP_ALIVE") + } else { + "30m" + }, + options = list(temperature = 0, num_ctx = 16384L) + ) + ) + }, + extract = function(resp) { + x <- resp$response + if (is.null(x)) { + stop("Ollama returned empty response", + call. = FALSE + ) + } + x + } + ) + ) +} + +# Internal: fire ONE LLM request through a single provider. Returns +# the model's raw text reply. Errors propagate to the caller so the +# chain dispatcher below can decide whether to fall back. +# +# Default timeout is 600s (10 min) -- long enough to accommodate +# slow CPU-only local Ollama generation on a Raspberry Pi. Override +# via MORIE_LLM_TIMEOUT_S env var or the timeout_s arg. +.siu_llm_call_one <- function(model, prompt, + timeout_s = .siu_llm_default_timeout()) { + if (!requireNamespace("httr2", quietly = TRUE)) { + stop("LLM helpers require the 'httr2' package: ", + "install.packages('httr2')", + call. = FALSE + ) + } + if (!requireNamespace("jsonlite", quietly = TRUE)) { + stop("LLM helpers require the 'jsonlite' package", call. = FALSE) + } + providers <- .siu_llm_providers() + if (!model %in% names(providers)) { + stop("Unknown LLM model: '", model, "'. Available: ", + paste(names(providers), collapse = ", "), + call. = FALSE + ) + } + p <- providers[[model]] + # Ollama gets a localhost:11434 default if OLLAMA_HOST is unset -- + # that's the zero-config "install ollama, pull gemma3:4b, done" + # path. All other providers still hard-require their API key env. + if (p$env_required == "OLLAMA_HOST_OR_DEFAULT") { + env_val <- Sys.getenv("OLLAMA_HOST", unset = "") + if (!nzchar(env_val)) env_val <- "http://localhost:11434" + } else { + env_val <- Sys.getenv(p$env_required, unset = "") + if (!nzchar(env_val)) { + stop("Env var '", p$env_required, "' is not set; cannot call ", + model, ". Set it, or use model = \"ollama\" with a local ", + "Ollama daemon for a free zero-config alternative.", + call. = FALSE + ) + } + } + env <- setNames(list(env_val), p$env_required) + req_spec <- p$build(env, prompt) + req <- httr2::request(req_spec$url) + if (!is.null(req_spec$headers)) { + req <- httr2::req_headers(req, !!!req_spec$headers) + } + req <- httr2::req_body_json(req, req_spec$body) + req <- httr2::req_timeout(req, timeout_s) + resp <- httr2::req_perform(req) + parsed <- httr2::resp_body_json(resp) + p$extract(parsed) +} + +# Internal: try `model` in order. The first one whose env var is set +# AND whose request returns without erroring wins. `model` may be a +# character vector for failover (e.g. c("gemini", "ollama")). The +# `mock_response_text` arg exists ONLY so unit tests can exercise +# the surrounding R glue without hitting the network. +.siu_llm_call <- function(model, prompt, + timeout_s = .siu_llm_default_timeout(), + mock_response_text = NULL) { + if (!is.null(mock_response_text)) { + return(mock_response_text) + } + if (!length(model)) { + stop("`model` must be a non-empty character vector", + call. = FALSE + ) + } + errs <- character(0) + for (m in model) { + res <- tryCatch( + .siu_llm_call_one(m, prompt, timeout_s = timeout_s), + error = function(e) structure(conditionMessage(e), class = "err") + ) + if (!inherits(res, "err")) { + return(res) + } + errs <- c(errs, sprintf("[%s] %s", m, as.character(res))) + } + stop("All LLM providers failed:\n ", + paste(errs, collapse = "\n "), + call. = FALSE + ) +} + +# The canonical 64-column SIU schema. Hard-coded so the LLM gets the +# exact field list and order the C++ parser emits. +.siu_field_list <- function() { + c( + "case_number", "drid", "nrid", "source_url_report", "source_url_news", + "scraped_at_utc", "parser_version", "date_of_incident_iso", + "date_of_incident_raw", "time_of_incident_raw", "date_of_injury_iso", + "date_of_injury_raw", "incident_to_injury_raw", "date_siu_notified_iso", + "date_siu_notified_raw", "time_of_notification_raw", "notifying_party", + "notifying_party_other_text", "date_of_director_decision_iso", + "date_of_director_decision_raw", "time_of_director_decision_raw", + "siu_investigators", "siu_forensics_investigators", "police_service", + "number_of_officers_involved", "location_of_call", + "type_of_building_or_scene", "reason_for_interaction", + "injuries_sustained", "injuries_other_text", "specific_injuries", + "location_of_treatment", "number_of_affected_persons", + "sex_gender_affected", "age_affected", "affected_interviewed", + "date_of_affected_interview_iso", "date_of_affected_interview_raw", + "number_of_civilian_witnesses", "date_of_witness_interview_raw", + "number_of_subject_officials", "subject_official_interviewed_or_notes", + "date_of_subject_interview_raw", "number_of_witness_officials", + "date_of_witness_official_interview_raw", "evidence_types", + "evidence_other_text", "evidence_features", "narrative_summary", + "relevant_legislation", "legislation_other_text", + "weapons_or_force_used", "weapons_other_text", "charges_recommended", + "directors_decision_reasonable", "supplemental_materials", + "news_links_extra", "mental_health_or_race_indications", "_language", + "news_release_title", "news_release_date_iso", "news_release_date_raw", + "news_release_summary", "directors_name" + ) +} + +#' Extract SIU report fields with an LLM (Gemini or Claude) +#' +#' Sends the cached director's-report HTML for one case through a +#' large-language-model endpoint and asks it to return the 64-column +#' morie schema as JSON. The result is in the SAME row format as the +#' C++ parser, so it drops straight into \code{morie_siu_compare()} +#' as the \code{external} argument for an independent diff against +#' the parser. +#' +#' The cached HTML remains the ground truth. This function does not +#' claim the LLM is more accurate than the regex parser; it provides +#' a fast second extraction so disagreements between two independent +#' methods (regex vs. LLM) can be flagged for human review against +#' the saved report. +#' +#' Credentials are read from environment variables only -- never +#' hard-coded, never passed as function arguments -- so secrets do +#' not leak into call traces, logs, or scripts. Set +#' \code{GOOGLE_API_KEY} for Gemini, \code{ANTHROPIC_API_KEY} for +#' Claude, or \code{OLLAMA_HOST} (e.g. +#' \code{"http://localhost:11434"} or an OllamaFreeAPI base URL) plus +#' optionally \code{OLLAMA_MODEL} (default \code{"llama3.2:3b"}) for +#' Ollama-compatible open-weight endpoints. +#' +#' @param case_number An SIU case number (e.g. \code{"17-OVI-201"}). +#' @param model One of \code{"ollama"} (default; free, runs locally, +#' zero-config when an Ollama daemon is on \code{localhost:11434}), +#' \code{"gemini"} (paid), or \code{"claude"} (paid). A character +#' vector enables fail-over: the first model whose call succeeds +#' wins. The default \code{c("ollama", "gemini")} tries the local +#' free model first and only escalates to paid Gemini if Ollama +#' isn't installed or fails -- so morie costs $0 to use as long +#' as you have a free Gemma / Qwen / Llama running locally +#' (e.g. \code{ollama pull gemma3:4b}). +#' @param cache_dir Directory holding the harvester's SIU.csv and +#' the optional \code{html/} subdirectory. +#' @param max_html_chars Soft cap on the HTML payload sent to the +#' model (default 80,000 -- larger than any real SIU report, +#' small enough to stay under typical context budgets). +#' @param mock_response_text For testing only: if non-NULL, skip the +#' network call and use this string as the model's raw reply. +#' @return A one-row data frame with the 64 morie SIU columns. Any +#' field the model could not extract is the empty string +#' (matching the C++ parser's convention). +#' @examples +#' \dontrun{ +#' Sys.setenv(GOOGLE_API_KEY = "your-gemini-key") +#' r <- morie_siu_llm_extract("17-OVI-201", model = "gemini") +#' # Diff parser vs LLM against the HTML: +#' morie_siu_compare( +#' "17-OVI-201", +#' external = r, +#' field_map = setNames(as.list(names(r)), names(r)), +#' external_case_col = "case_number" +#' ) +#' } +#' @export +morie_siu_llm_extract <- function(case_number, model = c("ollama", "gemini"), + cache_dir = file.path(tempdir(), "morie", "siu"), + max_html_chars = 80000L, + mock_response_text = NULL) { + audit <- morie_siu_audit_case(case_number, + cache_dir = cache_dir, + fetch_if_missing = is.null(mock_response_text) + ) + html <- audit$report_html + if (!nzchar(html)) { + stop("No HTML available for '", case_number, "'.", + call. = FALSE + ) + } + if (nchar(html) > max_html_chars) html <- substr(html, 1L, max_html_chars) + + fields <- .siu_field_list() + prompt <- paste( + "You are extracting structured data from an Ontario Special", + "Investigations Unit (SIU) director's report. Return ONLY a JSON", + "object with these exact keys, in this order. Use the empty string", + "for fields the report does not state. Use ISO 8601 (YYYY-MM-DD)", + "for any *_iso date field; keep the report's original wording in", + "the matching *_raw field. For boolean fields, return \"true\" or", + "\"false\". Do not invent any values.\n\n", + "Keys:\n", paste(fields, collapse = ", "), "\n\n", + "Report HTML:\n", html + ) + + text <- .siu_llm_call(model, prompt, + mock_response_text = mock_response_text + ) + # Some models wrap JSON in ```json ... ```; strip if present. + text <- gsub("^```(?:json)?\\s*|\\s*```$", "", text, perl = TRUE) + parsed <- jsonlite::fromJSON(text, simplifyVector = TRUE) + # Coerce to a row of `fields` exact length + order, blanks where missing. + vals <- vapply(fields, function(f) { + v <- parsed[[f]] + if (is.null(v) || (length(v) == 1L && is.na(v))) { + "" + } else { + as.character(v)[1L] + } + }, character(1)) + out <- as.data.frame(t(vals), stringsAsFactors = FALSE) + names(out) <- fields + # Always overwrite the bookkeeping columns with what we know. + out$case_number <- as.character(audit$row$case_number[1L]) + out$drid <- as.character(audit$drid) + if (!is.na(audit$nrid)) out$nrid <- as.character(audit$nrid) + out$source_url_report <- paste0( + "https://www.siu.on.ca/en/directors_report_details.php?drid=", + audit$drid + ) + out$parser_version <- paste0("llm-", paste(model, collapse = "+")) + out$scraped_at_utc <- format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ", + tz = "UTC" + ) + out +} + +#' Per-field anomaly check: does the parser's extraction match the HTML? +#' +#' For each populated field in the parser's row, ask the LLM whether +#' the extracted value is supported by the cached report HTML. Used +#' to surface fields where the regex parser is plausibly wrong -- +#' the LLM's verdicts are not authoritative, just an automated way +#' to triage which rows a human should re-read against the HTML. +#' +#' One API call is made per case (all fields batched into a single +#' prompt with structured-JSON output). +#' +#' @inheritParams morie_siu_llm_extract +#' @return A data frame with one row per populated parser field: +#' \code{field}, \code{parser_value}, \code{verdict} (one of +#' \code{"agree"} / \code{"disagree"} / \code{"unclear"}), and +#' \code{reason} (a short sentence pointing to the report passage). +#' @examples +#' \dontrun{ +#' Sys.setenv(GOOGLE_API_KEY = "your-gemini-key") +#' a <- morie_siu_anomaly_check("17-OVI-201", model = "gemini") +#' subset(a, verdict == "disagree") +#' } +#' @export +morie_siu_anomaly_check <- function(case_number, model = c("ollama", "gemini"), + cache_dir = file.path(tempdir(), "morie", "siu"), + max_html_chars = 80000L, + mock_response_text = NULL) { + audit <- morie_siu_audit_case(case_number, + cache_dir = cache_dir, + fetch_if_missing = is.null(mock_response_text) + ) + html <- audit$report_html + if (!nzchar(html)) { + stop("No HTML available for '", case_number, "'.", + call. = FALSE + ) + } + if (nchar(html) > max_html_chars) html <- substr(html, 1L, max_html_chars) + + # Build the (field, parser_value) pairs we'll ask the LLM to check. + # Skip empty fields and pure-metadata columns the parser sets + # mechanically (source_url, drid, nrid, scraped_at_utc, etc.). + meta_cols <- c( + "drid", "nrid", "source_url_report", "source_url_news", + "scraped_at_utc", "parser_version", "_language" + ) + populated <- Filter( + function(p) nzchar(p$value) && !(p$field %in% meta_cols), + lapply(setdiff(names(audit$row), meta_cols), function(col) { + list(field = col, value = as.character(audit$row[[col]][1L])) + }) + ) + if (!length(populated)) { + return(data.frame( + field = character(0), parser_value = character(0), + verdict = character(0), reason = character(0), + stringsAsFactors = FALSE + )) + } + + pairs_block <- paste(vapply(populated, function(p) { + sprintf( + "- %s: %s", p$field, + substr(p$value, 1L, 400L) + ) # cap each value at 400 chars + }, character(1)), collapse = "\n") + + prompt <- paste0( + "You are auditing a regex-based extractor against an Ontario SIU\n", + "director's-report HTML page. For each field below, decide if the\n", + "extracted VALUE is supported by the REPORT.\n\n", + "Return ONLY a JSON array of objects, one per field, with keys:\n", + " field (string), verdict (one of \"agree\", \"disagree\",\n", + " \"unclear\"), reason (one short sentence quoting the report\n", + " passage if possible).\n", + "Use \"unclear\" if the report neither confirms nor contradicts.\n\n", + "FIELDS:\n", pairs_block, "\n\n", + "REPORT HTML:\n", html + ) + + text <- .siu_llm_call(model, prompt, + mock_response_text = mock_response_text + ) + text <- gsub("^```(?:json)?\\s*|\\s*```$", "", text, perl = TRUE) + rows <- jsonlite::fromJSON(text, simplifyVector = TRUE) + if (is.null(rows) || (is.data.frame(rows) && !nrow(rows))) { + return(data.frame( + field = character(0), parser_value = character(0), + verdict = character(0), reason = character(0), + stringsAsFactors = FALSE + )) + } + # Merge parser_value back in. + parser_vals <- setNames( + vapply(populated, function(p) p$value, character(1)), + vapply(populated, function(p) p$field, character(1)) + ) + rows <- as.data.frame(rows, stringsAsFactors = FALSE) + if (!"field" %in% names(rows)) { + stop("LLM response missing 'field' column", call. = FALSE) + } + rows$parser_value <- unname(parser_vals[rows$field]) + rows <- rows[, c("field", "parser_value", "verdict", "reason")] + rows +} + +#' Row-level sanity check on a parsed SIU table (regex-only, no LLM) +#' +#' For every row in a parser-emitted SIU table, flag cells that +#' don't match the expected format for their column -- `case_number` +#' that doesn't look like an SIU case id, `date_*_iso` that isn't a +#' valid ISO 8601 date, `number_of_*` that isn't a positive integer, +#' `charges_recommended` that isn't "Yes" / "No", etc. Returns a +#' data frame ranked by issue count so the most-broken rows surface +#' at the top for manual inspection against the cached HTML. +#' +#' Designed to be a fast first-pass quality filter -- runs in +#' milliseconds, no network, no LLM, no API key. Doesn't try to +#' verify correctness against the underlying report (that's what +#' \code{morie_siu_audit_columns()} is for); just checks that each +#' value MATCHES THE EXPECTED FORMAT for its field. A clean sanity +#' check is necessary but not sufficient for correctness. +#' +#' @param df A data frame in the morie SIU 64-column schema, or a +#' path to such a CSV. +#' @return A data frame with one row per source row, columns: +#' \code{case_number}, \code{drid}, \code{issues_count} (integer +#' number of suspicious cells), \code{issues} (semicolon-separated +#' string of \code{field:reason} pairs). Ordered descending by +#' \code{issues_count}. +#' @examples +#' \dontrun{ +#' csv <- morie_fetch_siu(cache_dir = tempdir(), cache_html = TRUE) +#' sanity <- morie_siu_sanity_check(csv) +#' head(sanity, 10) # worst 10 rows -- inspect against HTML +#' table(sanity$issues_count) +#' } +#' @export +morie_siu_sanity_check <- function(df) { + if (is.character(df) && length(df) == 1L && file.exists(df)) { + df <- utils::read.csv(df, + colClasses = "character", + check.names = FALSE + ) + } + stopifnot(is.data.frame(df), "case_number" %in% names(df)) + + check_case <- function(v) { + bad <- nzchar(v) & !grepl("^[0-9]{2}-[A-Z]{2,4}-[0-9]{3}$", v) + ifelse(bad, "case_number:bad-format", "") + } + check_iso <- function(v, col) { + bad <- nzchar(v) & !grepl("^[0-9]{4}-[0-9]{2}-[0-9]{2}$", v) + ifelse(bad, paste0(col, ":bad-iso"), "") + } + check_int <- function(v, col) { + bad <- nzchar(v) & !grepl("^[0-9]+$", v) + ifelse(bad, paste0(col, ":not-int"), "") + } + check_yn <- function(v, col) { + bad <- nzchar(v) & !v %in% c("Yes", "No") + ifelse(bad, paste0(col, ":not-Yes/No"), "") + } + check_gender <- function(v) { + bad <- nzchar(v) & !v %in% c("Male", "Female", "Non-binary") + ifelse(bad, "sex_gender_affected:bad-value", "") + } + check_officer_count <- function(v) { + # Should be "N SO" or "N SO M WO" or "N WO" + bad <- nzchar(v) & + !grepl("^[0-9]+ (SO|WO)( [0-9]+ WO)?$", v) + ifelse(bad, "number_of_officers_involved:bad-format", "") + } + check_nonempty <- function(v, col) { + # Used for fields that should always populate when case_number is set + ifelse(nzchar(df$case_number) & !nzchar(v), + paste0(col, ":empty-when-expected"), "" + ) + } + check_short <- function(v, col, min_chars) { + bad <- nzchar(v) & nchar(v) < min_chars + ifelse(bad, paste0(col, ":suspiciously-short"), "") + } + check_chrome <- function(v, col) { + # Page-chrome words that should never leak into report fields + bad <- grepl("Liaison Program|twitter\\.com|skipNavigation|sitemap", + v, + ignore.case = FALSE + ) + ifelse(bad, paste0(col, ":page-chrome-leak"), "") + } + + cells <- list( + check_case(df$case_number), + check_iso(df$date_of_incident_iso, "date_of_incident_iso"), + check_iso(df$date_of_injury_iso, "date_of_injury_iso"), + check_iso(df$date_siu_notified_iso, "date_siu_notified_iso"), + check_iso( + df$date_of_director_decision_iso, + "date_of_director_decision_iso" + ), + check_int( + df$number_of_affected_persons, + "number_of_affected_persons" + ), + check_int( + df$number_of_civilian_witnesses, + "number_of_civilian_witnesses" + ), + check_int( + df$number_of_subject_officials, + "number_of_subject_officials" + ), + check_int( + df$number_of_witness_officials, + "number_of_witness_officials" + ), + check_int(df$age_affected, "age_affected"), + check_officer_count(df$number_of_officers_involved), + check_yn(df$charges_recommended, "charges_recommended"), + check_yn( + df$directors_decision_reasonable, + "directors_decision_reasonable" + ), + check_gender(df$sex_gender_affected), + check_nonempty(df$police_service, "police_service"), + check_nonempty(df$narrative_summary, "narrative_summary"), + check_short(df$narrative_summary, "narrative_summary", 100L), + check_chrome(df$narrative_summary, "narrative_summary"), + check_chrome(df$supplemental_materials, "supplemental_materials"), + check_chrome( + df$mental_health_or_race_indications, + "mental_health_or_race_indications" + ) + ) + + collapsed <- do.call(paste, c(cells, sep = ";")) + collapsed <- gsub(";+", ";", collapsed) + collapsed <- gsub("^;|;$", "", collapsed) + issues_count <- vapply( + strsplit(collapsed, ";"), + function(x) sum(nzchar(x)), integer(1) + ) + out <- data.frame( + case_number = df$case_number, + drid = df$drid, + issues_count = issues_count, + issues = collapsed, + stringsAsFactors = FALSE + ) + out <- out[order(-out$issues_count, out$drid), , drop = FALSE] + rownames(out) <- NULL + out +} + +#' Translate SIU report text into any target language via local LLM +#' +#' For SIU cases whose parser-emitted text isn't in the reader's +#' preferred language, translate the long-form text fields into +#' \code{target_lang} via a local Ollama model (default $0 cost, +#' no API key) and save each translation as a canonical override. +#' Subsequent \code{morie_fetch_siu()} runs then return text in +#' \code{target_lang} for those cases automatically. +#' +#' Use cases: +#' \itemize{ +#' \item French-only SIU reports (a few per year of SIU output) +#' that have no English-paired drid -- translate to "en" +#' so downstream analyses can join them with the rest. +#' \item English SIU reports that a Hindi / Spanish / Mandarin / +#' Punjabi / Arabic / etc. reader needs -- translate to +#' their first language for accessibility. +#' \item Any cross-language pivot for community-oriented +#' publication, where the reader's first language isn't +#' what the SIU originally published in. +#' } +#' +#' Idempotent (skips cases that already have an override on file +#' for this \code{target_lang}). Self-improving (every translation +#' accumulates in \code{/canonical_overrides.csv}, so +#' the SIU table becomes more accessible every time you run this). +#' Maintainers can promote the resulting overrides into the +#' shipped \code{inst/extdata/siu_canonical_overrides.csv.gz}. +#' +#' For best speed/quality on multilingual translation use +#' \code{OLLAMA_MODEL=translategemma:latest} -- a Gemma model +#' fine-tuned for translation. Falls back to whatever model +#' \code{OLLAMA_MODEL} points at. +#' +#' @param target_lang Target ISO 639-1 language code (or full +#' language name). Defaults to \code{Sys.getenv("MORIE_USER_LANG")} +#' or, failing that, the first two characters of +#' \code{Sys.getenv("LANG")} -- so it picks up the user's +#' system locale automatically. +#' @param source_lang Source language code, or \code{NULL} (default) +#' to use each row's parsed \code{_language} field. +#' @param case_numbers Character vector of SIU case numbers to +#' translate. Defaults to every row whose \code{_language} +#' differs from \code{target_lang} and has no override yet. +#' @param model LLM model chain (see \code{\link{morie_siu_llm_extract}}). +#' Default \code{"ollama"} for $0 cost via local Gemma / etc. +#' @param fields Which text fields to translate. Defaults to the +#' long-form fields that benefit most from translation: +#' \code{narrative_summary}, \code{news_release_summary}, +#' \code{news_release_title}, \code{relevant_legislation}. +#' @param cache_dir Directory holding the harvester's SIU.csv and +#' cached HTML. +#' @param progress Print per-case progress. +#' @return Invisibly, a data frame of newly-recorded +#' (case_number, field, verified_value) translations. +#' @examples +#' \dontrun{ +#' Sys.setenv( +#' OLLAMA_HOST = "http://localhost:11434", +#' OLLAMA_MODEL = "translategemma:latest" +#' ) +#' csv <- morie_fetch_siu(cache_html = TRUE) +#' # Translate every non-English row to English: +#' morie_siu_translate(target_lang = "en") +#' # Or translate everything to Hindi for a Hindi-first reader: +#' morie_siu_translate(target_lang = "hi") +#' # Re-fetch picks up the new overrides automatically: +#' csv <- morie_fetch_siu(overwrite = TRUE) +#' } +#' @export +morie_siu_translate <- function( + target_lang = NULL, source_lang = NULL, + case_numbers = NULL, model = "ollama", + fields = c( + "narrative_summary", "news_release_summary", + "news_release_title", "relevant_legislation" + ), + cache_dir = file.path(tempdir(), "morie", "siu"), progress = TRUE +) { + if (is.null(target_lang) || !nzchar(target_lang)) { + target_lang <- Sys.getenv("MORIE_USER_LANG", unset = "") + if (!nzchar(target_lang)) { + sys_lang <- Sys.getenv("LANG", unset = "") + target_lang <- if (nzchar(sys_lang)) { + substr(sys_lang, 1L, 2L) + } else { + "en" + } + } + } + # Two-letter ISO is enough for the model to know what to do; + # also accept full names like "English", "Hindi". + target_lang <- tolower(target_lang) + .siu_translate_impl( + target_lang = target_lang, source_lang = source_lang, + case_numbers = case_numbers, model = model, + fields = fields, cache_dir = cache_dir, progress = progress + ) +} + +#' @rdname morie_siu_translate +#' @description \code{morie_siu_translate_fr_to_en} is a thin +#' back-compat wrapper that calls \code{morie_siu_translate} +#' with \code{target_lang = "en", source_lang = "fr"}. +#' @export +morie_siu_translate_fr_to_en <- function( + case_numbers = NULL, model = "ollama", + fields = c( + "narrative_summary", "news_release_summary", + "news_release_title", "relevant_legislation" + ), + cache_dir = file.path(tempdir(), "morie", "siu"), progress = TRUE +) { + .siu_translate_impl( + target_lang = "en", source_lang = "fr", + case_numbers = case_numbers, model = model, + fields = fields, cache_dir = cache_dir, progress = progress + ) +} + +.siu_translate_impl <- function( + target_lang, source_lang, case_numbers, model, + fields, cache_dir, progress +) { + cache_dir <- path.expand(cache_dir) + csv_path <- file.path(cache_dir, "SIU.csv") + if (!file.exists(csv_path)) { + stop("No SIU.csv at '", csv_path, "'; run morie_fetch_siu() first.", + call. = FALSE + ) + } + df <- utils::read.csv(csv_path, + colClasses = "character", + check.names = FALSE + ) + + # Default to every row whose source language differs from the + # target AND that has no override yet for the narrative_summary + # field. (One override is enough to mark "already translated"; + # we don't re-translate the same row.) + existing <- .siu_load_canonical_overrides(user_cache_dir = cache_dir) + done <- if (!is.null(existing)) { + existing$case_number[existing$field == "narrative_summary"] + } else { + character(0) + } + + if (is.null(case_numbers)) { + lang_col <- df[["_language"]] + if (is.null(source_lang)) { + # Translate every row whose detected language is NOT the + # target language (covers fr, unknown, etc.). + candidates <- df$case_number[lang_col != target_lang & + nzchar(df$case_number)] + } else { + candidates <- df$case_number[lang_col == source_lang & + nzchar(df$case_number)] + } + case_numbers <- setdiff(candidates, done) + } else { + case_numbers <- setdiff(case_numbers, done) + } + if (!length(case_numbers)) { + if (progress) message("Nothing to translate; all caught up.") + return(invisible(data.frame())) + } + if (progress) { + message( + "Translating ", length(case_numbers), + " case(s) to ", sQuote(target_lang), " via ", + paste(model, collapse = "+"), " ..." + ) + } + # Friendly full-name lookup for common target codes; falls back + # to whatever the caller passed. + lang_full <- c( + en = "English", fr = "French", es = "Spanish", + hi = "Hindi", pa = "Punjabi", ur = "Urdu", + zh = "Mandarin Chinese", ar = "Arabic", + de = "German", it = "Italian", pt = "Portuguese", + ja = "Japanese", ko = "Korean", ru = "Russian", + tr = "Turkish", vi = "Vietnamese", tl = "Tagalog", + so = "Somali", am = "Amharic", ta = "Tamil", + bn = "Bengali", gu = "Gujarati" + ) + target_name <- if (target_lang %in% names(lang_full)) { + lang_full[[target_lang]] + } else { + target_lang + } + + acc <- list() + for (i in seq_along(case_numbers)) { + cn <- case_numbers[i] + row <- df[df$case_number == cn, , drop = FALSE] + if (!nrow(row)) next + # Build a single prompt translating each non-empty field at once. + items <- lapply(fields, function(f) { + v <- as.character(row[[f]][1L]) + if (!nzchar(v)) { + return(NULL) + } + list(field = f, value = substr(v, 1L, 8000L)) + }) + items <- Filter(Negate(is.null), items) + if (!length(items)) next + + body <- paste(vapply(items, function(it) { + sprintf("[%s]\n%s\n[/%s]", it$field, it$value, it$field) + }, character(1)), collapse = "\n\n") + prompt <- paste0( + "Translate the following Ontario SIU director's-report text\n", + "fields into clear professional ", target_name, ".\n", + "Preserve all proper names, dates, case identifiers, and\n", + "legal references verbatim. Return ONLY a JSON object whose\n", + "keys are the field names and whose values are the\n", + target_name, " translations.\n\n", body + ) + + text <- tryCatch( + .siu_llm_call(model, prompt, timeout_s = 120L), + error = function(e) { + if (progress) message(" skip ", cn, ": ", conditionMessage(e)) + NULL + } + ) + if (is.null(text)) next + text <- gsub("^```(?:json)?\\s*|\\s*```$", "", text, perl = TRUE) + parsed <- tryCatch(jsonlite::fromJSON(text, simplifyVector = TRUE), + error = function(e) NULL + ) + if (is.null(parsed)) next + + for (it in items) { + en_val <- parsed[[it$field]] + if (is.null(en_val) || !nzchar(as.character(en_val))) next + morie_siu_record_correction( + case_number = cn, field = it$field, + verified_value = as.character(en_val)[1L], + note = sprintf( + "auto-translated to %s via %s", + target_lang, paste(model, collapse = "+") + ), + cache_dir = cache_dir + ) + acc[[length(acc) + 1L]] <- data.frame( + case_number = cn, field = it$field, + verified_value = as.character(en_val)[1L], + stringsAsFactors = FALSE + ) + } + if (progress) { + message(" ", i, "/", length(case_numbers), " ", cn, " done") + } + } + + if (!length(acc)) { + return(invisible(data.frame())) + } + out <- do.call(rbind, acc) + rownames(out) <- NULL + if (progress) { + message( + "Translated ", nrow(out), + " (case, field) cells; written to ", + file.path(cache_dir, "canonical_overrides.csv") + ) + } + invisible(out) +} + +#' Per-column accuracy audit: estimate every SIU column's correctness +#' +#' Runs \code{morie_siu_anomaly_check()} on a vector of case_numbers +#' and aggregates per-field across them. Output is a data frame with +#' one row per SIU column, ordered by how often the LLM auditor +#' agreed with the C++ parser. The worst-ranked rows are the +#' parser fields that most deserve regex / extraction-logic fixes. +#' +#' Examples of LLM-flagged disagreements are attached as the +#' \code{"examples"} attribute of the returned data frame (one +#' nested data frame per field), with at most +#' \code{max_examples_per_field} cases each. Each example carries +#' the case_number, the parser_value, and the LLM's one-sentence +#' reason -- enough for a maintainer to pop the cached HTML for +#' that case, see who's right, and decide whether to refine the +#' regex pattern for that field. +#' +#' Designed for cheap local audit: with \code{model = "ollama"} +#' pointed at a local Gemma / Qwen / DeepSeek instance, auditing +#' 50-100 cases costs zero API spend and finishes in a few +#' minutes. With \code{model = c("gemini", "ollama")} the chain +#' uses paid Gemini first and silently falls back to the local +#' model on quota / network errors. +#' +#' @inheritParams morie_siu_anomaly_check +#' @param case_numbers Character vector of SIU case numbers to audit. +#' @param max_examples_per_field Maximum disagreement examples +#' retained per field (default 5). +#' @param progress Logical; print a per-case progress line. +#' @return A data frame with columns \code{field}, \code{n_audited}, +#' \code{n_agree}, \code{n_disagree}, \code{n_unclear}, +#' \code{agree_rate}. Sorted ascending by \code{agree_rate} so the +#' most-broken fields land at the top. The \code{"examples"} +#' attribute holds nested data frames of flagged cases per field. +#' @examples +#' \dontrun{ +#' Sys.setenv( +#' OLLAMA_HOST = "http://localhost:11434", +#' OLLAMA_MODEL = "gemma3:4b" +#' ) +#' csv <- morie_fetch_siu(cache_html = TRUE) +#' df <- utils::read.csv(csv, colClasses = "character") +#' sample <- sample(df$case_number[nzchar(df$case_number)], 50L) +#' audit <- morie_siu_audit_columns(sample, model = "ollama") +#' # Worst 8 fields, ripe for parser fixes: +#' head(audit, 8) +#' # See concrete disagreements for the worst field: +#' attr(audit, "examples")[[audit$field[1L]]] +#' } +#' @export +morie_siu_audit_columns <- function(case_numbers, model = c("ollama", "gemini"), + cache_dir = file.path(tempdir(), "morie", "siu"), + max_html_chars = 80000L, + max_examples_per_field = 5L, + progress = TRUE) { + case_numbers <- as.character(case_numbers) + if (!length(case_numbers)) { + stop("`case_numbers` must be non-empty.", call. = FALSE) + } + verdicts <- vector("list", length(case_numbers)) + for (i in seq_along(case_numbers)) { + if (progress) { + message( + "auditing ", case_numbers[i], " [", i, "/", + length(case_numbers), "] ..." + ) + } + v <- tryCatch( + morie_siu_anomaly_check(case_numbers[i], + model = model, + cache_dir = cache_dir, + max_html_chars = max_html_chars + ), + error = function(e) { + if (progress) message(" skipped: ", conditionMessage(e)) + NULL + } + ) + if (!is.null(v) && nrow(v)) { + v$case_number <- case_numbers[i] + verdicts[[i]] <- v + } + } + verdicts <- Filter(Negate(is.null), verdicts) + if (!length(verdicts)) { + stop("All audit attempts failed. Check your model / API key.", + call. = FALSE + ) + } + all <- do.call(rbind, verdicts) + + fields <- sort(unique(all$field)) + per_field <- lapply(fields, function(fld) { + sub <- all[all$field == fld, , drop = FALSE] + n <- nrow(sub) + agree <- sum(sub$verdict == "agree") + disagree <- sum(sub$verdict == "disagree") + unclear <- sum(sub$verdict == "unclear") + # Sample disagreement examples (and append any "unclear" until we + # hit the cap, so even mostly-unclear fields show what tripped them). + bad <- sub[sub$verdict %in% c("disagree", "unclear"), , drop = FALSE] + bad <- bad[order(match(bad$verdict, c("disagree", "unclear"))), , + drop = FALSE + ] + examples <- if (nrow(bad)) { + bad[seq_len(min(max_examples_per_field, nrow(bad))), + c("case_number", "parser_value", "verdict", "reason"), + drop = FALSE + ] + } else { + NULL + } + list( + field = fld, n = n, agree = agree, disagree = disagree, + unclear = unclear, examples = examples + ) + }) + + out <- data.frame( + field = vapply(per_field, function(x) x$field, character(1)), + n_audited = vapply(per_field, function(x) x$n, integer(1)), + n_agree = vapply(per_field, function(x) x$agree, integer(1)), + n_disagree = vapply(per_field, function(x) x$disagree, integer(1)), + n_unclear = vapply(per_field, function(x) x$unclear, integer(1)), + stringsAsFactors = FALSE + ) + out$agree_rate <- ifelse(out$n_audited > 0L, + out$n_agree / out$n_audited, NA_real_ + ) + out <- out[order(out$agree_rate, out$field), , drop = FALSE] + rownames(out) <- NULL + attr(out, "examples") <- setNames( + lapply(per_field, function(x) x$examples), + vapply(per_field, function(x) x$field, character(1)) + ) + out +} diff --git a/r-package/morie/R/smixd.R b/r-package/morie/R/smixd.R index e9a9f2e5a3..cf7d9b1766 100644 --- a/r-package/morie/R/smixd.R +++ b/r-package/morie/R/smixd.R @@ -1,4 +1,35 @@ # SPDX-License-Identifier: AGPL-3.0-or-later + +# Internal: spatial-mixed-model REML negative log-likelihood. +# Extracted from the smixd() optimiser closure so the non-positive- +# -definite-covariance, singular-information and non-positive-variance +# guards are all directly unit-testable. `theta` = c(log phi, log nu). +.smixd_negreml <- function(theta, D, n, X, y, p) { + phi <- exp(theta[1]) + nu <- exp(theta[2]) + Sigma <- exp(-D / phi) + nu * diag(n) + L <- tryCatch(chol(Sigma), error = function(e) NULL) + if (is.null(L)) { + return(1e12) + } + Xw <- backsolve(L, X, transpose = TRUE) + yw <- backsolve(L, y, transpose = TRUE) + XtSiX <- crossprod(Xw) + L2 <- tryCatch(chol(XtSiX), error = function(e) NULL) + if (is.null(L2)) { + return(1e12) + } + beta <- as.numeric(solve(XtSiX, crossprod(Xw, yw))) + resid <- yw - Xw %*% beta + sigma2 <- as.numeric(sum(resid^2)) / max(n - p, 1) + if (sigma2 <= 0) { + return(1e12) + } + logdet_S <- 2 * sum(log(diag(L))) + logdet_K <- 2 * sum(log(diag(L2))) + 0.5 * (logdet_S + logdet_K + (n - p) * log(2 * pi * sigma2) + (n - p)) +} + #' Spatial linear mixed model via REML. #' #' Y = X beta + delta + eps, @@ -11,40 +42,27 @@ #' @return Named list: estimate, se, sigma2, tau2, phi, n, method. #' @references Patterson & Thompson (1971); Schabenberger & Gotway (2005), Ch 5. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' smixd(x = rnorm(50), y = rnorm(50), coords = matrix(runif(100), 50, 2)) #' @export smixd <- function(x, y, coords) { - X <- as.matrix(x); y <- as.numeric(y); n <- length(y) - coords <- if (is.matrix(coords)) coords else + X <- as.matrix(x) + y <- as.numeric(y) + n <- length(y) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } p <- ncol(X) D <- as.matrix(stats::dist(coords)) h_max <- max(D, 1) - neg_reml <- function(theta) { - phi <- exp(theta[1]); nu <- exp(theta[2]) - Sigma <- exp(-D / phi) + nu * diag(n) - L <- tryCatch(chol(Sigma), error = function(e) NULL) - if (is.null(L)) return(1e12) - Xw <- backsolve(L, X, transpose = TRUE) - yw <- backsolve(L, y, transpose = TRUE) - XtSiX <- crossprod(Xw) - L2 <- tryCatch(chol(XtSiX), error = function(e) NULL) - if (is.null(L2)) return(1e12) - beta <- as.numeric(solve(XtSiX, crossprod(Xw, yw))) - resid <- yw - Xw %*% beta - sigma2 <- as.numeric(sum(resid ^ 2)) / max(n - p, 1) - if (sigma2 <= 0) return(1e12) - logdet_S <- 2 * sum(log(diag(L))) - logdet_K <- 2 * sum(log(diag(L2))) - 0.5 * (logdet_S + logdet_K + (n - p) * log(2 * pi * sigma2) + (n - p)) - } + neg_reml <- function(theta) .smixd_negreml(theta, D, n, X, y, p) res <- stats::optim(c(log(h_max / 3), log(0.1)), neg_reml, - method = "Nelder-Mead", - control = list(maxit = 400, reltol = 1e-6)) - phi <- exp(res$par[1]); nu <- exp(res$par[2]) + method = "Nelder-Mead", + control = list(maxit = 400, reltol = 1e-6) + ) + phi <- exp(res$par[1]) + nu <- exp(res$par[2]) Sigma <- exp(-D / phi) + nu * diag(n) L <- chol(Sigma) Xw <- backsolve(L, X, transpose = TRUE) @@ -52,12 +70,14 @@ smixd <- function(x, y, coords) { XtSiX <- crossprod(Xw) beta <- as.numeric(solve(XtSiX, crossprod(Xw, yw))) resid <- yw - Xw %*% beta - sigma2 <- as.numeric(sum(resid ^ 2)) / max(n - p, 1) + sigma2 <- as.numeric(sum(resid^2)) / max(n - p, 1) tau2 <- nu * sigma2 se_beta <- sqrt(diag(sigma2 * solve(XtSiX))) - list(estimate = beta, se = se_beta, sigma2 = sigma2, tau2 = tau2, - phi = phi, n = n, - method = "Spatial linear mixed model (REML, exponential covariance)") + list( + estimate = beta, se = se_beta, sigma2 = sigma2, tau2 = tau2, + phi = phi, n = n, + method = "Spatial linear mixed model (REML, exponential covariance)" + ) } # CANONICAL TEST @@ -66,4 +86,4 @@ smixd <- function(x, y, coords) { #' @rdname smixd #' @keywords internal #' @export -spatial_mixed_model <- smixd +morie_spatial_mixed_model <- smixd diff --git a/r-package/morie/R/sobls.R b/r-package/morie/R/sobls.R index f14e4d9748..4c23a62eef 100644 --- a/r-package/morie/R/sobls.R +++ b/r-package/morie/R/sobls.R @@ -16,14 +16,21 @@ sobls <- function(N = 128L, d = 1L, f = NULL, scramble = TRUE, seed = 42L) { sample <- NULL if (requireNamespace("randtoolbox", quietly = TRUE)) { sobol_fn <- getFromNamespace("sobol", "randtoolbox") - sample <- sobol_fn(n = as.integer(N), dim = as.integer(d), - scrambling = if (scramble) 1L else 0L, seed = seed) + # randtoolbox's Owen scrambling is disabled upstream in current + # releases; requesting it (scrambling = 1) only emits a spurious + # "scrambling is currently disabled" warning while returning the + # unscrambled sequence anyway, so request 0 unconditionally. + sample <- sobol_fn( + n = as.integer(N), dim = as.integer(d), + scrambling = 0L, seed = seed + ) if (!is.matrix(sample)) sample <- matrix(sample, ncol = d) } else { # Halton sequence fallback (pure R) primes <- c(2, 3, 5, 7, 11, 13, 17, 19, 23, 29)[seq_len(d)] halton <- function(i, b) { - f <- 1; r <- 0 + f <- 1 + r <- 0 while (i > 0) { f <- f / b r <- r + f * (i %% b) @@ -32,11 +39,14 @@ sobls <- function(N = 128L, d = 1L, f = NULL, scramble = TRUE, seed = 42L) { r } sample <- matrix(0, N, d) - for (j in seq_len(d)) + for (j in seq_len(d)) { sample[, j] <- vapply(seq_len(N), halton, numeric(1), b = primes[j]) + } } - out <- list(sample = sample, N = as.integer(N), d = as.integer(d), - method = "Sobol QMC (Sobol 1967)") + out <- list( + sample = sample, N = as.integer(N), d = as.integer(d), + method = "Sobol QMC (Sobol 1967)" + ) if (!is.null(f)) { fv <- apply(sample, 1, f) out$estimate <- mean(fv) @@ -52,4 +62,4 @@ sobls <- function(N = 128L, d = 1L, f = NULL, scramble = TRUE, seed = 42L) { #' @rdname sobls #' @keywords internal #' @export -sobol_sequence <- sobls +morie_sobol_sequence <- sobls diff --git a/r-package/morie/R/spblk.R b/r-package/morie/R/spblk.R index 32769ff2d1..4f80e050bb 100644 --- a/r-package/morie/R/spblk.R +++ b/r-package/morie/R/spblk.R @@ -14,28 +14,37 @@ #' @return Named list: estimate, se, n, method. #' @references Schabenberger & Gotway (2005), Ch 4. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export spblk <- function(x, coords, blocks, n_quad = 25, nugget = 0, sill = 1, range_ = 1) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } d <- ncol(coords) - c0 <- nugget; c1 <- sill - nugget; a <- range_ + c0 <- nugget + c1 <- sill - nugget + a <- range_ cov_fn <- function(h) c1 * exp(-h / a) + ifelse(h == 0, c0, 0) D <- as.matrix(stats::dist(coords)) C <- cov_fn(D) A <- matrix(0, n + 1, n + 1) - A[1:n, 1:n] <- C; A[1:n, n + 1] <- 1; A[n + 1, 1:n] <- 1 + A[1:n, 1:n] <- C + A[1:n, n + 1] <- 1 + A[n + 1, 1:n] <- 1 block_to_pts <- function(b) { b <- if (is.matrix(b)) b else matrix(as.numeric(unlist(b)), ncol = d) if (nrow(b) == 2 && ncol(b) == d) { - lo <- b[1, ]; hi <- b[2, ] - if (d == 1) return(matrix(seq(lo[1], hi[1], length.out = n_quad), ncol = 1)) + lo <- b[1, ] + hi <- b[2, ] + if (d == 1) { + return(matrix(seq(lo[1], hi[1], length.out = n_quad), ncol = 1)) + } if (d == 2) { k <- round(sqrt(n_quad)) g1 <- seq(lo[1], hi[1], length.out = k) @@ -47,9 +56,10 @@ spblk <- function(x, coords, blocks, n_quad = 25, b } m <- length(blocks) - ests <- numeric(m); ses <- numeric(m) + ests <- numeric(m) + ses <- numeric(m) pairwise_cov <- function(P, Q) { - DPQ <- sqrt(outer(rowSums(P ^ 2), rowSums(Q ^ 2), "+") - 2 * P %*% t(Q)) + DPQ <- sqrt(outer(rowSums(P^2), rowSums(Q^2), "+") - 2 * P %*% t(Q)) DPQ[DPQ < 0] <- 0 cov_fn(DPQ) } @@ -59,16 +69,20 @@ spblk <- function(x, coords, blocks, n_quad = 25, Cbar_BB <- mean(pairwise_cov(pts, pts)) rhs <- c(Cbar_iB, 1) sol <- tryCatch(solve(A, rhs), - error = function(e) qr.solve(A, rhs)) - lam <- sol[1:n]; mu <- sol[n + 1] + error = function(e) qr.solve(A, rhs) + ) + lam <- sol[1:n] + mu <- sol[n + 1] ests[b_idx] <- sum(lam * x) ses[b_idx] <- sqrt(max(Cbar_BB - sum(lam * Cbar_iB) - mu, 0)) } - list(estimate = ests, se = ses, n = n, - method = "Ordinary block kriging (exp. cov, MC quadrature)") + list( + estimate = ests, se = ses, n = n, + method = "Ordinary block kriging (exp. cov, MC quadrature)" + ) } #' @rdname spblk #' @keywords internal #' @export -spatial_block_kriging <- spblk +morie_spatial_block_kriging <- spblk diff --git a/r-package/morie/R/spcrs.R b/r-package/morie/R/spcrs.R index 2864a19607..cb43cd6a19 100644 --- a/r-package/morie/R/spcrs.R +++ b/r-package/morie/R/spcrs.R @@ -9,16 +9,19 @@ #' @return Named list: estimate (MSPE, RMSPE, MAE, residuals), n, method. #' @references Schabenberger & Gotway (2005), Ch 4. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' spcrs(x = rnorm(50), coords = matrix(runif(100), 50, 2)) #' @export spcrs <- function(x, coords, nugget = 0, sill = 1, range_ = 1) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) - c0 <- nugget; c1 <- sill - nugget; a <- range_ + } + c0 <- nugget + c1 <- sill - nugget + a <- range_ D <- as.matrix(stats::dist(coords)) cov_fn <- function(h) c1 * exp(-h / a) + ifelse(h == 0, c0, 0) resid <- numeric(n) @@ -28,20 +31,27 @@ spcrs <- function(x, coords, nugget = 0, sill = 1, range_ = 1) { Cii <- cov_fn(D[sel, sel]) c_vec <- cov_fn(D[i, sel]) A <- matrix(0, m + 1, m + 1) - A[1:m, 1:m] <- Cii; A[1:m, m + 1] <- 1; A[m + 1, 1:m] <- 1 + A[1:m, 1:m] <- Cii + A[1:m, m + 1] <- 1 + A[m + 1, 1:m] <- 1 rhs <- c(c_vec, 1) sol <- tryCatch(solve(A, rhs), - error = function(e) qr.solve(A, rhs)) + error = function(e) qr.solve(A, rhs) + ) z_hat <- sum(sol[1:m] * x[sel]) resid[i] <- x[i] - z_hat } - mspe <- mean(resid ^ 2) - list(estimate = list(MSPE = mspe, RMSPE = sqrt(mspe), - MAE = mean(abs(resid)), residuals = resid), - n = n, method = "LOO cross-validation for ordinary kriging") + mspe <- mean(resid^2) + list( + estimate = list( + MSPE = mspe, RMSPE = sqrt(mspe), + MAE = mean(abs(resid)), residuals = resid + ), + n = n, method = "LOO cross-validation for ordinary kriging" + ) } #' @rdname spcrs #' @keywords internal #' @export -spatial_cross_validation <- spcrs +morie_spatial_cross_validation <- spcrs diff --git a/r-package/morie/R/specf.R b/r-package/morie/R/specf.R index 1d2e29d7c1..880db4ae3b 100644 --- a/r-package/morie/R/specf.R +++ b/r-package/morie/R/specf.R @@ -8,13 +8,11 @@ #' @return Named list with \code{frequencies, psd, n_segments, nperseg, #' fs, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_spectral_density(x = rnorm(50)) #' @export -spectral_density <- function(x, fs = 1, nperseg = NULL) { - r <- as.numeric(x); n <- length(r) +morie_spectral_density <- function(x, fs = 1, nperseg = NULL) { + r <- as.numeric(x) + n <- length(r) if (n < 8) stop("Need >=8 obs.") if (is.null(nperseg)) nperseg <- max(n %/% 4, 8) nperseg <- min(nperseg, n) @@ -22,10 +20,12 @@ spectral_density <- function(x, fs = 1, nperseg = NULL) { win <- 0.5 - 0.5 * cos(2 * pi * (0:(nperseg - 1)) / max(nperseg - 1, 1)) U <- sum(win^2) nfreq <- nperseg %/% 2 + 1 - S <- numeric(nfreq); nseg <- 0; start <- 1 + S <- numeric(nfreq) + nseg <- 0 + start <- 1 while (start + nperseg - 1 <= n) { seg <- (r[start:(start + nperseg - 1)] - - mean(r[start:(start + nperseg - 1)])) * win + mean(r[start:(start + nperseg - 1)])) * win Fk <- fft(seg)[1:nfreq] S <- S + Mod(Fk)^2 nseg <- nseg + 1 @@ -33,7 +33,9 @@ spectral_density <- function(x, fs = 1, nperseg = NULL) { } S <- S / (nseg * U * fs) freqs <- seq(0, fs / 2, length.out = nfreq) - list(frequencies = freqs, psd = S, n_segments = nseg, - nperseg = nperseg, fs = fs, n = n, - method = "Welch PSD (Hann window, 50% overlap, base R)") + list( + frequencies = freqs, psd = S, n_segments = nseg, + nperseg = nperseg, fs = fs, n = n, + method = "Welch PSD (Hann window, 50% overlap, base R)" + ) } diff --git a/r-package/morie/R/spqkv.R b/r-package/morie/R/spqkv.R index a8128b9b19..377d8b0560 100644 --- a/r-package/morie/R/spqkv.R +++ b/r-package/morie/R/spqkv.R @@ -11,14 +11,19 @@ #' @return Named list with tensor (additive mask), boolean, density, method. #' @keywords internal sparse_attention <- function(x, window = 4L, stride = 8L, - n_random = 0L, seed = 0L) { - N <- if (length(x) == 1L && is.numeric(x)) as.integer(x) - else if (!is.null(dim(x))) dim(x)[length(dim(x)) - 1L] - else length(x) + n_random = 0L, seed = 0L) { + N <- if (length(x) == 1L && is.numeric(x)) { + as.integer(x) + } else if (!is.null(dim(x))) { + dim(x)[length(dim(x)) - 1L] + } else { + length(x) + } set.seed(seed) M <- matrix(FALSE, N, N) for (i in seq_len(N)) { - lo <- max(1L, i - window); hi <- min(N, i + window) + lo <- max(1L, i - window) + hi <- min(N, i + window) M[i, lo:hi] <- TRUE M[i, seq.int(1L, N, by = stride)] <- TRUE if (n_random > 0L) { @@ -28,6 +33,8 @@ sparse_attention <- function(x, window = 4L, stride = 8L, } additive <- ifelse(M, 0, -Inf) density <- sum(M) / (N * N) - list(tensor = additive, boolean = M, density = density, - method = "sparse-attention") + list( + tensor = additive, boolean = M, density = density, + method = "sparse-attention" + ) } diff --git a/r-package/morie/R/sptag.R b/r-package/morie/R/sptag.R index 0da99e7681..b1c7b8f0e9 100644 --- a/r-package/morie/R/sptag.R +++ b/r-package/morie/R/sptag.R @@ -9,33 +9,40 @@ #' @return Named list with `agreement` (n by n), `mean_agreement`, #' `n`, `m`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' sptag(x = rnorm(50)) #' @export sptag <- function(x) { M <- if (is.matrix(x)) x else matrix(as.numeric(x), ncol = 1L) - n <- nrow(M); m <- ncol(M) - if (n < 2L) - return(list(agreement = diag(n), mean_agreement = NA_real_, - n = n, m = m, method = "spatial_agreement")) - A <- diag(n); valid <- !is.na(M) - for (i in seq_len(n - 1L)) for (j in (i + 1L):n) { - both <- valid[i, ] & valid[j, ] - denom <- sum(both) - if (denom == 0L) A[i, j] <- A[j, i] <- NA_real_ - else { - same <- sum(M[i, both] == M[j, both]) - A[i, j] <- A[j, i] <- same / denom + n <- nrow(M) + m <- ncol(M) + if (n < 2L) { + return(list( + agreement = diag(n), mean_agreement = NA_real_, + n = n, m = m, method = "morie_spatial_agreement" + )) + } + A <- diag(n) + valid <- !is.na(M) + for (i in seq_len(n - 1L)) { + for (j in (i + 1L):n) { + both <- valid[i, ] & valid[j, ] + denom <- sum(both) + if (denom == 0L) { + A[i, j] <- A[j, i] <- NA_real_ + } else { + same <- sum(M[i, both] == M[j, both]) + A[i, j] <- A[j, i] <- same / denom + } } } iu <- upper.tri(A) - list(agreement = A, mean_agreement = mean(A[iu], na.rm = TRUE), - n = n, m = m, method = "spatial_agreement") + list( + agreement = A, mean_agreement = mean(A[iu], na.rm = TRUE), + n = n, m = m, method = "morie_spatial_agreement" + ) } #' @keywords internal #' @rdname sptag #' @export -spatial_agreement <- sptag +morie_spatial_agreement <- sptag diff --git a/r-package/morie/R/sptau.R b/r-package/morie/R/sptau.R index 86ba1be0c8..d405f38fdb 100644 --- a/r-package/morie/R/sptau.R +++ b/r-package/morie/R/sptau.R @@ -14,54 +14,66 @@ #' z_score, n, method. #' @references Cliff & Ord (1981). Schabenberger & Gotway (2005), Ch 1. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export sptau <- function(x, w) { - x <- as.numeric(x); n <- length(x) + x <- as.numeric(x) + n <- length(x) W <- as.matrix(w) - if (!all(dim(W) == c(n, n))) + if (!all(dim(W) == c(n, n))) { stop("w must be an n-by-n matrix") + } if (n < 3) { - return(list(statistic = NA_real_, p_value = NA_real_, - expectation = NA_real_, variance = NA_real_, - z_score = NA_real_, n = n, - method = "Moran's I (spatial autocorrelation)")) + return(list( + statistic = NA_real_, p_value = NA_real_, + expectation = NA_real_, variance = NA_real_, + z_score = NA_real_, n = n, + method = "Moran's I (spatial autocorrelation)" + )) } - xbar <- mean(x); z <- x - xbar + xbar <- mean(x) + z <- x - xbar S0 <- sum(W) num <- as.numeric(t(z) %*% W %*% z) - den <- sum(z ^ 2) - if (S0 == 0 || den == 0) - return(list(statistic = NA_real_, p_value = NA_real_, - expectation = NA_real_, variance = NA_real_, - z_score = NA_real_, n = n, - method = "Moran's I (spatial autocorrelation)")) + den <- sum(z^2) + if (S0 == 0 || den == 0) { + return(list( + statistic = NA_real_, p_value = NA_real_, + expectation = NA_real_, variance = NA_real_, + z_score = NA_real_, n = n, + method = "Moran's I (spatial autocorrelation)" + )) + } I <- (n / S0) * (num / den) EI <- -1 / (n - 1) - S1 <- 0.5 * sum((W + t(W)) ^ 2) - S2 <- sum((rowSums(W) + colSums(W)) ^ 2) - m2 <- mean(z ^ 2); m4 <- mean(z ^ 4) - b2 <- if (m2 > 0) m4 / (m2 ^ 2) else 3 - A <- n * ((n ^ 2 - 3 * n + 3) * S1 - n * S2 + 3 * S0 ^ 2) - B <- b2 * ((n ^ 2 - n) * S1 - 2 * n * S2 + 6 * S0 ^ 2) - Cc <- (n - 1) * (n - 2) * (n - 3) * S0 ^ 2 + S1 <- 0.5 * sum((W + t(W))^2) + S2 <- sum((rowSums(W) + colSums(W))^2) + m2 <- mean(z^2) + m4 <- mean(z^4) + b2 <- if (m2 > 0) m4 / (m2^2) else 3 + A <- n * ((n^2 - 3 * n + 3) * S1 - n * S2 + 3 * S0^2) + B <- b2 * ((n^2 - n) * S1 - 2 * n * S2 + 6 * S0^2) + Cc <- (n - 1) * (n - 2) * (n - 3) * S0^2 if (Cc <= 0) { - var_I <- NA_real_; z_sc <- NA_real_; p <- NA_real_ + var_I <- NA_real_ + z_sc <- NA_real_ + p <- NA_real_ } else { - var_I <- (A - B) / Cc - EI ^ 2 + var_I <- (A - B) / Cc - EI^2 if (is.na(var_I) || var_I <= 0) { - z_sc <- NA_real_; p <- NA_real_ + z_sc <- NA_real_ + p <- NA_real_ } else { z_sc <- (I - EI) / sqrt(var_I) p <- 2 * (1 - stats::pnorm(abs(z_sc))) } } - list(statistic = I, p_value = p, expectation = EI, - variance = var_I, z_score = z_sc, n = n, - method = "Moran's I (spatial autocorrelation)") + list( + statistic = I, p_value = p, expectation = EI, + variance = var_I, z_score = z_sc, n = n, + method = "Moran's I (spatial autocorrelation)" + ) } # CANONICAL TEST @@ -72,4 +84,4 @@ sptau <- function(x, w) { #' @rdname sptau #' @keywords internal #' @export -spatial_autocorrelation <- sptau +morie_spatial_autocorrelation <- sptau diff --git a/r-package/morie/R/sptrn.R b/r-package/morie/R/sptrn.R index d52c2a81aa..0e4135307c 100644 --- a/r-package/morie/R/sptrn.R +++ b/r-package/morie/R/sptrn.R @@ -9,25 +9,27 @@ #' @return Named list: estimate, se, r2, order, n, method. #' @references Schabenberger & Gotway (2005), Ch 2. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' sptrn(x = rnorm(50), coords = matrix(runif(100), 50, 2)) #' @export sptrn <- function(x, coords, order = 2) { - y <- as.numeric(x); n <- length(y) - coords <- if (is.matrix(coords)) coords else + y <- as.numeric(x) + n <- length(y) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } d <- ncol(coords) cols <- list(rep(1, n)) if (d == 1) { s <- coords[, 1] - for (k in seq_len(order)) cols[[length(cols) + 1]] <- s ^ k + for (k in seq_len(order)) cols[[length(cols) + 1]] <- s^k } else { - s1 <- coords[, 1]; s2 <- coords[, 2] + s1 <- coords[, 1] + s2 <- coords[, 2] if (order >= 1) cols <- c(cols, list(s1, s2)) - if (order >= 2) cols <- c(cols, list(s1 ^ 2, s2 ^ 2, s1 * s2)) - if (order >= 3) cols <- c(cols, list(s1 ^ 3, s2 ^ 3, s1 ^ 2 * s2, s1 * s2 ^ 2)) + if (order >= 2) cols <- c(cols, list(s1^2, s2^2, s1 * s2)) + if (order >= 3) cols <- c(cols, list(s1^3, s2^3, s1^2 * s2, s1 * s2^2)) if (order >= 4) stop("trend_order > 3 not supported") } F_ <- do.call(cbind, cols) @@ -36,12 +38,14 @@ sptrn <- function(x, coords, order = 2) { XtX <- crossprod(F_) beta <- as.numeric(solve(XtX, crossprod(F_, y))) e <- y - as.numeric(F_ %*% beta) - sigma2 <- sum(e ^ 2) / max(n - p, 1) + sigma2 <- sum(e^2) / max(n - p, 1) se <- sqrt(pmax(diag(sigma2 * solve(XtX)), 0)) - ss_tot <- sum((y - mean(y)) ^ 2) - r2 <- if (ss_tot > 0) 1 - sum(e ^ 2) / ss_tot else 1 - list(estimate = beta, se = se, r2 = r2, order = as.integer(order), - n = n, method = sprintf("Polynomial trend surface (order=%d, OLS)", order)) + ss_tot <- sum((y - mean(y))^2) + r2 <- if (ss_tot > 0) 1 - sum(e^2) / ss_tot else 1 + list( + estimate = beta, se = se, r2 = r2, order = as.integer(order), + n = n, method = sprintf("Polynomial trend surface (order=%d, OLS)", order) + ) } # CANONICAL TEST @@ -50,4 +54,4 @@ sptrn <- function(x, coords, order = 2) { #' @rdname sptrn #' @keywords internal #' @export -spatial_trend_surface <- sptrn +morie_spatial_trend_surface <- sptrn diff --git a/r-package/morie/R/ssmod.R b/r-package/morie/R/ssmod.R index b53400440f..db15627c59 100644 --- a/r-package/morie/R/ssmod.R +++ b/r-package/morie/R/ssmod.R @@ -6,52 +6,70 @@ #' @return Named list with \code{filtered_state, filtered_state_variance, #' smoothed_state, loglik, Q, R, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_state_space_model(x = rnorm(50)) #' @export -state_space_model <- function(x) { - y <- as.numeric(x); n <- length(y) +morie_state_space_model <- function(x) { + y <- as.numeric(x) + n <- length(y) if (n < 4) stop("Need >=4 obs.") if (requireNamespace("dlm", quietly = TRUE)) { - build <- function(p) dlm::dlmModPoly(order = 1, - dV = exp(p[1]), dW = exp(p[2])) - fit <- dlm::dlmMLE(y, parm = c(log(var(diff(y)) / 2), - log(var(diff(y)) / 2)), - build = build) + build <- function(p) { + dlm::dlmModPoly( + order = 1, + dV = exp(p[1]), dW = exp(p[2]) + ) + } + fit <- dlm::dlmMLE(y, + parm = c( + log(var(diff(y)) / 2), + log(var(diff(y)) / 2) + ), + build = build + ) mod <- build(fit$par) f <- dlm::dlmFilter(y, mod) s <- dlm::dlmSmooth(f) - return(list(filtered_state = as.numeric(f$m)[-1], - filtered_state_variance = sapply(dlm::dlmSvd2var(f$U.C, f$D.C), - function(x) x[1, 1])[-1], - smoothed_state = as.numeric(s$s)[-1], - loglik = -fit$value, - Q = exp(fit$par[2]), - R = exp(fit$par[1]), - n = n, - method = "Local-level Kalman via dlm")) + return(list( + filtered_state = as.numeric(f$m)[-1], + filtered_state_variance = vapply( + dlm::dlmSvd2var(f$U.C, f$D.C), + function(x) x[1, 1], numeric(1) + )[-1], + smoothed_state = as.numeric(s$s)[-1], + loglik = -fit$value, + Q = exp(fit$par[2]), + R = exp(fit$par[1]), + n = n, + method = "Local-level Kalman via dlm" + )) } - Q <- var(diff(y)) / 2; R <- var(diff(y)) / 2 - a <- numeric(n); P <- numeric(n) - a[1] <- y[1]; P[1] <- 1e7 + Q <- var(diff(y)) / 2 + R <- var(diff(y)) / 2 + a <- numeric(n) + P <- numeric(n) + a[1] <- y[1] + P[1] <- 1e7 ll <- 0 for (t in 2:n) { Pp <- P[t - 1] + Q v <- y[t] - a[t - 1] - Fv <- Pp + R; K <- Pp / Fv + Fv <- Pp + R + K <- Pp / Fv a[t] <- a[t - 1] + K * v P[t] <- Pp - K * Pp ll <- ll + -0.5 * (log(2 * pi * Fv) + v^2 / Fv) } - a_s <- a; P_s <- P + a_s <- a + P_s <- P for (t in (n - 1):1) { - Pp <- P[t] + Q; J <- P[t] / Pp + Pp <- P[t] + Q + J <- P[t] / Pp a_s[t] <- a[t] + J * (a_s[t + 1] - a[t]) P_s[t] <- P[t] + J^2 * (P_s[t + 1] - Pp) } - list(filtered_state = a, filtered_state_variance = P, - smoothed_state = a_s, loglik = ll, Q = Q, R = R, n = n, - method = "Local-level Kalman filter+smoother (base R)") + list( + filtered_state = a, filtered_state_variance = P, + smoothed_state = a_s, loglik = ll, Q = Q, R = R, n = n, + method = "Local-level Kalman filter+smoother (base R)" + ) } diff --git a/r-package/morie/R/stacv.R b/r-package/morie/R/stacv.R index 62bb374af5..622e162426 100644 --- a/r-package/morie/R/stacv.R +++ b/r-package/morie/R/stacv.R @@ -13,30 +13,35 @@ #' n, method. #' @references Cressie & Huang (1999); Schabenberger & Gotway (2005), Ch 8. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' stacv(x = rnorm(50), coords = matrix(runif(100), 50, 2), times = sort(cumsum(rexp(50)))) #' @export stacv <- function(x, coords, times, n_spatial_bins = 6, n_temporal_bins = 6, max_spatial = NULL, max_temporal = NULL) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } t <- as.numeric(times) - if (nrow(coords) != n || length(t) != n) + if (nrow(coords) != n || length(t) != n) { stop("shape mismatch among x, coords, times") + } xbar <- mean(x) sd_full <- as.matrix(stats::dist(coords)) td_full <- abs(outer(t, t, "-")) iu <- which(upper.tri(sd_full), arr.ind = TRUE) - sd_f <- sd_full[iu]; td_f <- td_full[iu] + sd_f <- sd_full[iu] + td_f <- td_full[iu] prods <- (x[iu[, 1]] - xbar) * (x[iu[, 2]] - xbar) - if (is.null(max_spatial)) + if (is.null(max_spatial)) { max_spatial <- if (max(sd_f) > 0) max(sd_f) / 2 else 1 - if (is.null(max_temporal)) + } + if (is.null(max_temporal)) { max_temporal <- if (max(td_f) > 0) max(td_f) / 2 else 1 + } s_edges <- seq(0, max_spatial, length.out = n_spatial_bins + 1) t_edges <- seq(0, max_temporal, length.out = n_temporal_bins + 1) C <- matrix(NA_real_, n_spatial_bins, n_temporal_bins) @@ -44,7 +49,7 @@ stacv <- function(x, coords, times, for (i in seq_len(n_spatial_bins)) { for (j in seq_len(n_temporal_bins)) { m <- sd_f > s_edges[i] & sd_f <= s_edges[i + 1] & - td_f > t_edges[j] & td_f <= t_edges[j + 1] + td_f > t_edges[j] & td_f <= t_edges[j + 1] k <- sum(m) counts[i, j] <- as.integer(k) if (k > 0) C[i, j] <- mean(prods[m]) @@ -52,9 +57,13 @@ stacv <- function(x, coords, times, } s_mids <- 0.5 * (s_edges[-1] + s_edges[-(n_spatial_bins + 1)]) t_mids <- 0.5 * (t_edges[-1] + t_edges[-(n_temporal_bins + 1)]) - list(estimate = list(C = C, spatial_bins = s_mids, - temporal_bins = t_mids, counts = counts), - n = n, method = "Empirical spatiotemporal autocovariance") + list( + estimate = list( + C = C, spatial_bins = s_mids, + temporal_bins = t_mids, counts = counts + ), + n = n, method = "Empirical spatiotemporal autocovariance" + ) } # CANONICAL TEST (3x3 spatiotemporal grid of x = i+t) @@ -62,4 +71,4 @@ stacv <- function(x, coords, times, #' @rdname stacv #' @keywords internal #' @export -spatiotemporal_autocovariance <- stacv +morie_spatiotemporal_autocovariance <- stacv diff --git a/r-package/morie/R/stkrg.R b/r-package/morie/R/stkrg.R index ce07177568..c0c72844b7 100644 --- a/r-package/morie/R/stkrg.R +++ b/r-package/morie/R/stkrg.R @@ -12,44 +12,55 @@ #' @return Named list: estimate, se, n, method. #' @references Schabenberger & Gotway (2005), Ch 8. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export stkrg <- function(x, coords, times, target, sill = 1, nugget = 0, range_s = 1, range_t = 1) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } t <- as.numeric(times) if (nrow(coords) != n || length(t) != n) stop("shape mismatch") - s0 <- as.matrix(target$s0); t0 <- as.numeric(target$t0) + s0 <- as.matrix(target$s0) + t0 <- as.numeric(target$t0) if (ncol(s0) != ncol(coords)) stop("target coords dim mismatch") if (nrow(s0) != length(t0)) stop("target s0 and t0 must align") - c0 <- nugget; c1 <- sill - nugget + c0 <- nugget + c1 <- sill - nugget Dnn <- as.matrix(stats::dist(coords)) Tnn <- abs(outer(t, t, "-")) Cmat <- c1 * exp(-Dnn / range_s) * exp(-Tnn / range_t) + c0 * diag(n) A <- matrix(0, n + 1, n + 1) - A[1:n, 1:n] <- Cmat; A[1:n, n + 1] <- 1; A[n + 1, 1:n] <- 1 + A[1:n, 1:n] <- Cmat + A[1:n, n + 1] <- 1 + A[n + 1, 1:n] <- 1 var_total <- sill m <- nrow(s0) - ests <- numeric(m); ses <- numeric(m) + ests <- numeric(m) + ses <- numeric(m) for (k in seq_len(m)) { - d0s <- sqrt(colSums((t(coords) - s0[k, ]) ^ 2)) + d0s <- sqrt(colSums((t(coords) - s0[k, ])^2)) d0t <- abs(t0[k] - t) c_vec <- c1 * exp(-d0s / range_s) * exp(-d0t / range_t) rhs <- c(c_vec, 1) sol <- tryCatch(solve(A, rhs), - error = function(e) qr.solve(A, rhs)) - lam <- sol[1:n]; mu <- sol[n + 1] + error = function(e) qr.solve(A, rhs) + ) + lam <- sol[1:n] + mu <- sol[n + 1] ests[k] <- sum(lam * x) ses[k] <- sqrt(max(var_total - sum(lam * c_vec) - mu, 0)) } - list(estimate = if (m == 1) ests[1] else ests, - se = if (m == 1) ses[1] else ses, n = n, - method = "Spatiotemporal ordinary kriging (separable exponential)") + list( + estimate = if (m == 1) ests[1] else ests, + se = if (m == 1) ses[1] else ses, n = n, + method = "Spatiotemporal ordinary kriging (separable exponential)" + ) } # CANONICAL TEST -- predict at an observed site, should reproduce z @@ -57,4 +68,4 @@ stkrg <- function(x, coords, times, target, #' @rdname stkrg #' @keywords internal #' @export -spatiotemporal_kriging <- stkrg +morie_spatiotemporal_kriging <- stkrg diff --git a/r-package/morie/R/strat.R b/r-package/morie/R/strat.R index 5b346d094d..5bae6ee692 100644 --- a/r-package/morie/R/strat.R +++ b/r-package/morie/R/strat.R @@ -25,8 +25,10 @@ strat <- function(data, y = "y", strata = "stratum", pop_sizes = NULL) { W_h <- n_h / sum(n_h) } else { N <- sum(pop_sizes) - W_h <- vapply(strata_names, function(s) pop_sizes[[as.character(s)]] / N, - numeric(1)) + W_h <- vapply( + strata_names, function(s) pop_sizes[[as.character(s)]] / N, + numeric(1) + ) } est <- sum(W_h * yb_h) var_st <- sum(W_h^2 * s2_h / n_h) @@ -34,11 +36,13 @@ strat <- function(data, y = "y", strata = "stratum", pop_sizes = NULL) { z <- stats::qnorm(0.975) names(W_h) <- as.character(strata_names) names(yb_h) <- as.character(strata_names) - list(estimate = as.numeric(est), se = as.numeric(se), - ci_lower = est - z * se, ci_upper = est + z * se, - weights = as.list(W_h), strata_means = as.list(yb_h), - n_strata = length(strata_names), - method = "Stratified mean (Cochran 1977)") + list( + estimate = as.numeric(est), se = as.numeric(se), + ci_lower = est - z * se, ci_upper = est + z * se, + weights = as.list(W_h), strata_means = as.list(yb_h), + n_strata = length(strata_names), + method = "Stratified mean (Cochran 1977)" + ) } # CANONICAL TEST @@ -50,4 +54,4 @@ strat <- function(data, y = "y", strata = "stratum", pop_sizes = NULL) { #' @rdname strat #' @keywords internal #' @export -stratified_sampling <- strat +morie_stratified_sampling <- strat diff --git a/r-package/morie/R/study_core.R b/r-package/morie/R/study_core.R index eba8ed58d4..776daf0b2a 100644 --- a/r-package/morie/R/study_core.R +++ b/r-package/morie/R/study_core.R @@ -3,7 +3,9 @@ } .safe_divide <- function(num, den) { - if (is.na(den) || den == 0) return(NA_real_) + if (is.na(den) || den == 0) { + return(NA_real_) + } as.numeric(num) / as.numeric(den) } @@ -128,7 +130,7 @@ stringsAsFactors = FALSE ) wrangling_log <- data.frame( - step = c("load_csv", "canonicalize_cpads_data", "validate_cpads_data"), + step = c("load_csv", "morie_canonicalize_cpads_data", "morie_validate_cpads_data"), description = c( "Read the bundled CPADS CSV from disk.", "Map raw CPADS fields to the MORIE canonical analysis contract.", @@ -136,11 +138,11 @@ ), rows_before = c(nrow(raw), nrow(raw), nrow(data)), rows_after = c(nrow(raw), nrow(data), nrow(data)), - cols_affected = c("all", paste(names(data), collapse = ","), paste(cpads_contract()$required_variables, collapse = ",")), + cols_affected = c("all", paste(names(data), collapse = ","), paste(morie_cpads_contract()$required_variables, collapse = ",")), stringsAsFactors = FALSE ) if (!is.null(output_dir)) { - project_root <- tryCatch(find_project_root(dirname(output_dir)), error = function(e) NULL) + project_root <- tryCatch(morie_find_project_root(dirname(output_dir)), error = function(e) NULL) if (!is.null(project_root)) { wrangled_dir <- file.path(project_root, "data", "private", "outputs", "wrangled") dir.create(wrangled_dir, recursive = TRUE, showWarnings = FALSE) @@ -591,7 +593,7 @@ } .run_propensity_scores_module_internal <- function(data) { - out <- run_propensity_ipw_analysis(data) + out <- morie_run_propensity_ipw_analysis(data) frame <- out$analysis_frame treated <- frame[frame$cannabis_any_use == 1, , drop = FALSE] control <- frame[frame$cannabis_any_use == 0, , drop = FALSE] @@ -618,7 +620,7 @@ ate_ipw <- prop_out$ipw_results$estimate[1] se_ipw <- prop_out$ipw_results$se[1] - out_model <- stats::glm( + out_model <- stats::glm( heavy_drinking_30d ~ cannabis_any_use + age_group_label + gender_label + province_region_label + mental_health_label + physical_health_label, data = frame, family = stats::quasibinomial(), @@ -706,17 +708,21 @@ } list( treatment_effects_summary = summary_tbl, - cate_subgroup_estimates = if (length(cate_rows) > 0L) do.call(rbind, cate_rows) else data.frame( - subgroup_var = character(), - subgroup_level = character(), - n_treated = integer(), - n_control = integer(), - cate = numeric(), - se = numeric(), - ci_lower = numeric(), - ci_upper = numeric(), - stringsAsFactors = FALSE - ) + cate_subgroup_estimates = if (length(cate_rows) > 0L) { + do.call(rbind, cate_rows) + } else { + data.frame( + subgroup_var = character(), + subgroup_level = character(), + n_treated = integer(), + n_control = integer(), + cate = numeric(), + se = numeric(), + ci_lower = numeric(), + ci_upper = numeric(), + stringsAsFactors = FALSE + ) + } ) } @@ -944,7 +950,7 @@ .run_ebac_selection_adjustment_ipw_module_internal <- function(data) { data <- .cpads_labeled_data(data) - out <- run_ebac_selection_ipw_analysis(data) + out <- morie_run_ebac_selection_ipw_analysis(data) eligible <- data[data$alcohol_past12m == 1, , drop = FALSE] eligible <- eligible[stats::complete.cases(eligible[, c("weight", "cannabis_any_use", "age_group_label", "gender_label", "province_region_label", "mental_health_label", "physical_health_label"), drop = FALSE]), , drop = FALSE] eligible$R <- as.integer(!is.na(eligible$ebac_tot)) diff --git a/r-package/morie/R/study_reporting.R b/r-package/morie/R/study_reporting.R index 1b06935a0c..d420ca116e 100644 --- a/r-package/morie/R/study_reporting.R +++ b/r-package/morie/R/study_reporting.R @@ -135,7 +135,9 @@ endpoint_df <- endpoint_df[stats::complete.cases(endpoint_df), , drop = FALSE] gsum <- do.call(rbind, lapply(levels(endpoint_df$gender_label), function(lvl) { sub <- endpoint_df[endpoint_df$gender_label == lvl, , drop = FALSE] - if (nrow(sub) == 0L) return(NULL) + if (nrow(sub) == 0L) { + return(NULL) + } est <- .weighted_binary_estimate(sub[[endpoint_name]], sub$weight) data.frame(gender = lvl, p = est$p, n = est$n, stringsAsFactors = FALSE) })) @@ -210,7 +212,7 @@ stringsAsFactors = FALSE ) } - target_required <- max(sapply(pair_rows, function(x) if (is.data.frame(x)) x$n_eq[1] else NA_real_), na.rm = TRUE) + target_required <- max(vapply(pair_rows, function(x) if (is.data.frame(x)) x$n_eq[1] else NA_real_, numeric(1)), na.rm = TRUE) if (!is.finite(target_required)) target_required <- NA_real_ target_rows[[length(target_rows) + 1L]] <- data.frame( endpoint = endpoint_name, @@ -399,7 +401,14 @@ } .legacy_reference_root <- function() { - file.path(find_project_root(), "migration_files", "one") + # The legacy migration tree exists only in a source checkout; an + # installed package has no project root. Degrade to NA so callers + # (.copy_legacy_artifacts) simply copy nothing rather than erroring. + root <- tryCatch(morie_find_project_root(), error = function(e) NA_character_) + if (is.na(root)) { + return(NA_character_) + } + file.path(root, "migration_files", "one") } .copy_legacy_artifacts <- function(relative_paths, output_dir, root = file.path(.legacy_reference_root(), "six", "outputs")) { @@ -464,7 +473,7 @@ stringsAsFactors = FALSE ) var_map <- data.frame( - variable_name = cpads_contract()$required_variables, + variable_name = morie_cpads_contract()$required_variables, user_guide_description = c( "Survey weight", "Alcohol use in the past 12 months", @@ -478,7 +487,7 @@ "Mental health", "Physical health" ), - exists_in_wrangled_data = cpads_contract()$required_variables %in% names(data), + exists_in_wrangled_data = morie_cpads_contract()$required_variables %in% names(data), coding_note = "See CPADS user guide PDF for official item wording and coding.", stringsAsFactors = FALSE ) @@ -510,7 +519,7 @@ stringsAsFactors = FALSE ), ebac_final_user_guide_variable_map = var_map, - ebac_final_variable_audit = data.frame(item = names(data), value = ifelse(names(data) %in% cpads_contract()$required_variables, "canonical", "auxiliary"), stringsAsFactors = FALSE) + ebac_final_variable_audit = data.frame(item = names(data), value = ifelse(names(data) %in% morie_cpads_contract()$required_variables, "canonical", "auxiliary"), stringsAsFactors = FALSE) ) } @@ -576,10 +585,18 @@ "Exposure: cannabis_any_use.", "See docs/source/modules/20212022-cpads-pumf-user-guide.pdf for source coding notes." ) + # The user-guide PDF lives in a source checkout only; tolerate its + # absence (and a missing project root) when run from an installed + # package rather than letting the whole report module error out. + proj_root <- tryCatch(morie_find_project_root(), error = function(e) NA_character_) + user_guide_present <- !is.na(proj_root) && file.exists(file.path( + proj_root, "docs", "source", "modules", + "20212022-cpads-pumf-user-guide.pdf" + )) audit_tbl <- data.frame( check_name = c("outputs_present", "user_guide_reference_present", "cpads_required_variables_present"), - value = c(length(output_files), file.exists(file.path(find_project_root(), "docs", "source", "modules", "20212022-cpads-pumf-user-guide.pdf")), all(cpads_contract()$required_variables %in% names(data))), - pass = c(length(output_files) > 0, file.exists(file.path(find_project_root(), "docs", "source", "modules", "20212022-cpads-pumf-user-guide.pdf")), all(cpads_contract()$required_variables %in% names(data))), + value = c(length(output_files), user_guide_present, all(morie_cpads_contract()$required_variables %in% names(data))), + pass = c(length(output_files) > 0, user_guide_present, all(morie_cpads_contract()$required_variables %in% names(data))), stringsAsFactors = FALSE ) list( diff --git a/r-package/morie/R/stvar.R b/r-package/morie/R/stvar.R index 0877fd7c2e..d4dc703826 100644 --- a/r-package/morie/R/stvar.R +++ b/r-package/morie/R/stvar.R @@ -13,28 +13,32 @@ #' counts), n, method. #' @references Cressie & Huang (1999); Schabenberger & Gotway (2005), Ch 8. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' stvar(x = rnorm(50), coords = matrix(runif(100), 50, 2), times = sort(cumsum(rexp(50)))) #' @export stvar <- function(x, coords, times, n_spatial_bins = 6, n_temporal_bins = 6, max_spatial = NULL, max_temporal = NULL) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } t <- as.numeric(times) if (nrow(coords) != n || length(t) != n) stop("shape mismatch") sd_full <- as.matrix(stats::dist(coords)) td_full <- abs(outer(t, t, "-")) iu <- which(upper.tri(sd_full), arr.ind = TRUE) - sd_f <- sd_full[iu]; td_f <- td_full[iu] - diffs2 <- (x[iu[, 1]] - x[iu[, 2]]) ^ 2 - if (is.null(max_spatial)) + sd_f <- sd_full[iu] + td_f <- td_full[iu] + diffs2 <- (x[iu[, 1]] - x[iu[, 2]])^2 + if (is.null(max_spatial)) { max_spatial <- if (max(sd_f) > 0) max(sd_f) / 2 else 1 - if (is.null(max_temporal)) + } + if (is.null(max_temporal)) { max_temporal <- if (max(td_f) > 0) max(td_f) / 2 else 1 + } s_edges <- seq(0, max_spatial, length.out = n_spatial_bins + 1) t_edges <- seq(0, max_temporal, length.out = n_temporal_bins + 1) gamma <- matrix(NA_real_, n_spatial_bins, n_temporal_bins) @@ -42,7 +46,7 @@ stvar <- function(x, coords, times, for (i in seq_len(n_spatial_bins)) { for (j in seq_len(n_temporal_bins)) { m <- sd_f > s_edges[i] & sd_f <= s_edges[i + 1] & - td_f > t_edges[j] & td_f <= t_edges[j + 1] + td_f > t_edges[j] & td_f <= t_edges[j + 1] k <- sum(m) counts[i, j] <- as.integer(k) if (k > 0) gamma[i, j] <- 0.5 * mean(diffs2[m]) @@ -50,12 +54,16 @@ stvar <- function(x, coords, times, } s_mids <- 0.5 * (s_edges[-1] + s_edges[-(n_spatial_bins + 1)]) t_mids <- 0.5 * (t_edges[-1] + t_edges[-(n_temporal_bins + 1)]) - list(estimate = list(gamma = gamma, spatial_bins = s_mids, - temporal_bins = t_mids, counts = counts), - n = n, method = "Empirical spatiotemporal variogram") + list( + estimate = list( + gamma = gamma, spatial_bins = s_mids, + temporal_bins = t_mids, counts = counts + ), + n = n, method = "Empirical spatiotemporal variogram" + ) } #' @rdname stvar #' @keywords internal #' @export -spatiotemporal_variogram <- stvar +morie_spatiotemporal_variogram <- stvar diff --git a/r-package/morie/R/sukht.R b/r-package/morie/R/sukht.R index 92a4754c84..c6047194df 100644 --- a/r-package/morie/R/sukht.R +++ b/r-package/morie/R/sukht.R @@ -9,24 +9,28 @@ #' @return Named list: statistic (z), p_value, U, n, m. #' @importFrom stats wilcox.test median #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_sukhatme_test(x = rnorm(50), y = rnorm(50)) #' @export -sukhatme_test <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) - m <- length(x); n <- length(y); N <- m + n +morie_sukhatme_test <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) + m <- length(x) + n <- length(y) + N <- m + n if (m < 2 || n < 2) { - return(list(statistic = NA_real_, p_value = NA_real_, U = NA_real_, - n = N, m = m, - method = "Sukhatme scale test")) + return(list( + statistic = NA_real_, p_value = NA_real_, U = NA_real_, + n = N, m = m, + method = "Sukhatme scale test" + )) } pooled_med <- stats::median(c(x, y)) ax <- abs(x - pooled_med) ay <- abs(y - pooled_med) - tst <- suppressWarnings(stats::wilcox.test(ax, ay, exact = FALSE, - correct = FALSE)) + tst <- suppressWarnings(stats::wilcox.test(ax, ay, + exact = FALSE, + correct = FALSE + )) U <- as.numeric(tst$statistic) E_U <- m * n / 2 Var_U <- m * n * (N + 1) / 12 diff --git a/r-package/morie/R/svmge.R b/r-package/morie/R/svmge.R index 6aa5f9eb50..da210cc7e8 100644 --- a/r-package/morie/R/svmge.R +++ b/r-package/morie/R/svmge.R @@ -11,50 +11,68 @@ #' @return list(estimate, y_hat, alpha, support_indices, se, n, method). #' @references Vapnik (1995); Montesinos Lopez Ch 7. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_svm_genomic(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) #' @export -svm_genomic <- function(x, y, markers, C = 1, epsilon = 0.1, - gamma = "scale") { - y <- as.numeric(y); n <- length(y) +morie_svm_genomic <- function(x, y, markers, C = 1, epsilon = 0.1, + gamma = "scale") { + y <- as.numeric(y) + n <- length(y) M <- as.matrix(markers) - feats <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) M - else cbind(as.matrix(x), M) + feats <- if (is.null(x) || (is.numeric(x) && length(x) == 0)) { + M + } else { + cbind(as.matrix(x), M) + } + # A zero-variance predictor cannot be scaled and makes e1071::svm warn + # ("variable constant"); drop any constant columns first. + if (ncol(feats) > 1L) { + keep <- apply(feats, 2L, function(col) stats::var(col) > 0) + if (any(keep) && !all(keep)) feats <- feats[, keep, drop = FALSE] + } use_e <- requireNamespace("e1071", quietly = TRUE) method_used <- "Kernel-ridge RBF fallback (no e1071)" if (use_e) { method_used <- "e1071 eps-SVR (RBF)" g_val <- if (identical(gamma, "scale")) { - v <- stats::var(as.numeric(M)); if (!is.finite(v) || v <= 0) v <- 1 + v <- stats::var(as.numeric(M)) + if (!is.finite(v) || v <= 0) v <- 1 1 / (ncol(M) * v) - } else as.numeric(gamma) - fit <- e1071::svm(feats, y, type = "eps-regression", kernel = "radial", - cost = C, epsilon = epsilon, gamma = g_val) + } else { + as.numeric(gamma) + } + fit <- e1071::svm(feats, y, + type = "eps-regression", kernel = "radial", + cost = C, epsilon = epsilon, gamma = g_val + ) y_hat <- as.numeric(stats::predict(fit, feats)) alpha <- as.numeric(fit$coefs) sv_idx <- as.integer(fit$index) intercept <- as.numeric(-fit$rho) } else { g_val <- if (identical(gamma, "scale")) { - v <- stats::var(as.numeric(M)); if (!is.finite(v) || v <= 0) v <- 1 + v <- stats::var(as.numeric(M)) + if (!is.finite(v) || v <= 0) v <- 1 1 / (ncol(M) * v) - } else as.numeric(gamma) + } else { + as.numeric(gamma) + } sq <- rowSums(feats^2) D2 <- pmax(outer(sq, sq, "+") - 2 * tcrossprod(feats), 0) K <- exp(-g_val * D2) - intercept <- mean(y); yc <- y - intercept + intercept <- mean(y) + yc <- y - intercept alpha <- as.numeric(solve(K + (1 / max(C, 1e-8)) * diag(n), yc)) y_hat <- as.numeric(K %*% alpha) + intercept sv_idx <- which(abs(alpha) > 1e-6) } resid <- y - y_hat - list(estimate = mean(y_hat), y_hat = y_hat, alpha = alpha, - support_indices = sv_idx, intercept = intercept, - se = sqrt(mean(resid^2)), n = n, method = method_used) + list( + estimate = mean(y_hat), y_hat = y_hat, alpha = alpha, + support_indices = sv_idx, intercept = intercept, + se = sqrt(mean(resid^2)), n = n, method = method_used + ) } # CANONICAL TEST # set.seed(12); M <- matrix(rnorm(100), 25, 4); y <- sin(M[,1])+0.2*rnorm(25) -# svm_genomic(rep(0, 25), y, M) +# morie_svm_genomic(rep(0, 25), y, M) diff --git a/r-package/morie/R/svmhg.R b/r-package/morie/R/svmhg.R index 095f43ddc7..47e73b9a23 100644 --- a/r-package/morie/R/svmhg.R +++ b/r-package/morie/R/svmhg.R @@ -12,19 +12,18 @@ #' classes, n, method. #' @importFrom stats predict #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -svm_hinge_primal <- function(x, y, C = 1.0, seed = 0L) { +morie_svm_hinge_primal <- function(x, y, C = 1.0, seed = 0L) { if (!requireNamespace("e1071", quietly = TRUE)) { - stop("Function 'svm_hinge_primal' requires package 'e1071'. Install with install.packages('e1071').") + stop("Function 'morie_svm_hinge_primal' requires package 'e1071'. Install with install.packages('e1071').") } if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); y <- as.factor(y) + x <- as.matrix(x) + y <- as.factor(y) classes <- levels(y) - if (length(classes) != 2) stop("svm_hinge_primal requires binary y") + if (length(classes) != 2) stop("morie_svm_hinge_primal requires binary y") set.seed(seed) fit <- e1071::svm(x = x, y = y, kernel = "linear", cost = C, scale = FALSE) # Reconstruct w = sum_i alpha_i y_i x_i (libsvm sign convention: coefs are alpha_i*y_i) diff --git a/r-package/morie/R/svmkr.R b/r-package/morie/R/svmkr.R index 7b68c2ab83..8ef33064e4 100644 --- a/r-package/morie/R/svmkr.R +++ b/r-package/morie/R/svmkr.R @@ -15,24 +15,24 @@ #' gamma, degree, n, method. #' @importFrom stats predict #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_svm_kernel_trick(x = rnorm(50), y = rnorm(50)) #' @export -svm_kernel_trick <- function(x, y, kernel = "rbf", C = 1.0, - gamma = "scale", degree = 3L, seed = 0L) { +morie_svm_kernel_trick <- function(x, y, kernel = "rbf", C = 1.0, + gamma = "scale", degree = 3L, seed = 0L) { + x <- .morie_ensure_design_matrix(x) if (!requireNamespace("e1071", quietly = TRUE)) { - stop("Function 'svm_kernel_trick' requires package 'e1071'. Install with install.packages('e1071').") + stop("Function 'morie_svm_kernel_trick' requires package 'e1071'. Install with install.packages('e1071').") } if (is.null(dim(x))) x <- matrix(x, ncol = 1) - x <- as.matrix(x); y <- as.factor(y) + x <- as.matrix(x) + y <- as.factor(y) p <- ncol(x) e1071_kernel <- switch(kernel, - rbf = "radial", - poly = "polynomial", - sigmoid = "sigmoid", - linear = "linear") + rbf = "radial", + poly = "polynomial", + sigmoid = "sigmoid", + linear = "linear" + ) if (identical(gamma, "scale")) { g <- 1 / (p * stats::var(as.numeric(x))) } else if (identical(gamma, "auto")) { @@ -41,8 +41,10 @@ svm_kernel_trick <- function(x, y, kernel = "rbf", C = 1.0, g <- as.numeric(gamma) } set.seed(seed) - fit <- e1071::svm(x = x, y = y, kernel = e1071_kernel, cost = C, - gamma = g, degree = degree, scale = FALSE) + fit <- e1071::svm( + x = x, y = y, kernel = e1071_kernel, cost = C, + gamma = g, degree = degree, scale = FALSE + ) preds <- predict(fit, x) acc <- mean(preds == y) list( diff --git a/r-package/morie/R/swigl.R b/r-package/morie/R/swigl.R index e58d1cfe24..67e7e05ce8 100644 --- a/r-package/morie/R/swigl.R +++ b/r-package/morie/R/swigl.R @@ -12,7 +12,8 @@ swiglu_activation <- function(x, W = NULL, V = NULL, b = NULL, c = NULL) { if (is.null(W) && is.null(V)) { d_out <- ncol(as.matrix(x)) - W <- diag(d_out); V <- diag(d_out) + W <- diag(d_out) + V <- diag(d_out) } else if (xor(is.null(W), is.null(V))) { stop("Provide both W and V or neither") } diff --git a/r-package/morie/R/synthetic.R b/r-package/morie/R/synthetic.R index 6b1cb310b4..55304992e3 100644 --- a/r-package/morie/R/synthetic.R +++ b/r-package/morie/R/synthetic.R @@ -3,7 +3,9 @@ inv_logit <- function(x) { } inject_special_codes <- function(x, rate = 0.02, codes = c(97L, 98L, 99L, 997L, 998L, 999L)) { - if (rate <= 0) return(x) + if (rate <= 0) { + return(x) + } n <- length(x) idx <- stats::runif(n) < rate if (any(idx)) { @@ -25,7 +27,7 @@ resolve_synthetic_name_map <- function(name_map, profile) { required <- synthetic_required_keys() if (is.null(name_map)) { - return(default_synthetic_name_map(profile = profile)) + return(morie_default_synthetic_name_map(profile = profile)) } if (is.list(name_map)) { @@ -58,13 +60,15 @@ resolve_synthetic_name_map <- function(name_map, profile) { #' Default synthetic-data variable name map #' #' Returns a named character vector mapping canonical variable keys used by -#' [generate_synthetic_data()] to output column names. +#' [morie_generate_synthetic_data()] to output column names. #' #' @param profile Name profile. `"generic"` is recommended for new projects. #' `"morie_legacy"` reproduces previous EML legacy column names. #' @return Named character vector. +#' @examples +#' morie_default_synthetic_name_map("generic") #' @export -default_synthetic_name_map <- function(profile = c("generic", "morie_legacy")) { +morie_default_synthetic_name_map <- function(profile = c("generic", "morie_legacy")) { profile <- match.arg(profile) if (identical(profile, "generic")) { @@ -120,10 +124,13 @@ default_synthetic_name_map <- function(profile = c("generic", "morie_legacy")) { #' @param profile Convenience profile for output naming; ignored when #' `name_map` is supplied. #' @param name_map Optional named character vector mapping canonical keys to -#' output column names. Use [default_synthetic_name_map()] as a template. +#' output column names. Use [morie_default_synthetic_name_map()] as a template. #' @return A data.frame with synthetic records. +#' @examples +#' df <- morie_generate_synthetic_data(n = 200, seed = 1) +#' nrow(df) #' @export -generate_synthetic_data <- function( +morie_generate_synthetic_data <- function( n = 5000L, seed = 42L, special_code_rate = 0.02, @@ -135,7 +142,7 @@ generate_synthetic_data <- function( stop("`n` must be an integer >= 100.", call. = FALSE) } if (!is.numeric(special_code_rate) || length(special_code_rate) != 1 || - is.na(special_code_rate) || special_code_rate < 0 || special_code_rate > 0.2) { + is.na(special_code_rate) || special_code_rate < 0 || special_code_rate > 0.2) { stop("`special_code_rate` must be in [0, 0.2].", call. = FALSE) } @@ -247,8 +254,11 @@ generate_synthetic_data <- function( #' @param name_map Optional custom variable name map. #' @param overwrite If `TRUE`, overwrite existing file. #' @return Normalized output path. +#' @examples +#' out <- morie_write_synthetic_data(tempfile(fileext = ".csv"), n = 200, seed = 1) +#' file.exists(out) #' @export -write_synthetic_data <- function( +morie_write_synthetic_data <- function( path, n = 5000L, seed = 42L, @@ -264,7 +274,7 @@ write_synthetic_data <- function( stop("File already exists. Set `overwrite = TRUE` to replace it.", call. = FALSE) } - dat <- generate_synthetic_data( + dat <- morie_generate_synthetic_data( n = n, seed = seed, special_code_rate = special_code_rate, diff --git a/r-package/morie/R/tarmd.R b/r-package/morie/R/tarmd.R index 2a3821bae3..72a2109431 100644 --- a/r-package/morie/R/tarmd.R +++ b/r-package/morie/R/tarmd.R @@ -9,42 +9,52 @@ #' @return Named list with \code{threshold, phi_lower, phi_upper, p, d, #' regime_sizes, sse, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_threshold_autoregression(x = rnorm(50)) #' @export -threshold_autoregression <- function(x, p = 1, d = 1, n_grid = 50) { - y <- as.numeric(x); n <- length(y); start <- max(p, d) - if (n - start < 4 * (p + 1)) +morie_threshold_autoregression <- function(x, p = 1, d = 1, n_grid = 50) { + y <- as.numeric(x) + n <- length(y) + start <- max(p, d) + if (n - start < 4 * (p + 1)) { stop("Series too short for SETAR(p, d).") + } Y <- y[(start + 1):n] - X <- cbind(1, do.call(cbind, - lapply(seq_len(p), function(i) y[(start - i + 1):(n - i)]))) + X <- cbind(1, do.call( + cbind, + lapply(seq_len(p), function(i) y[(start - i + 1):(n - i)]) + )) Z <- y[(start - d + 1):(n - d)] ql <- as.numeric(quantile(Z, 0.15)) qh <- as.numeric(quantile(Z, 0.85)) grid <- seq(ql, qh, length.out = n_grid) - best <- list(sse = Inf, threshold = NA, phi_lo = NULL, phi_hi = NULL, - sizes = NULL) + best <- list( + sse = Inf, threshold = NA, phi_lo = NULL, phi_hi = NULL, + sizes = NULL + ) for (c in grid) { - lo <- Z <= c; hi <- !lo + lo <- Z <= c + hi <- !lo if (sum(lo) < 2 * (p + 1) || sum(hi) < 2 * (p + 1)) next phi_lo <- lsfit(X[lo, , drop = FALSE], Y[lo], intercept = FALSE)$coef phi_hi <- lsfit(X[hi, , drop = FALSE], Y[hi], intercept = FALSE)$coef sse <- sum((Y[lo] - X[lo, , drop = FALSE] %*% phi_lo)^2) + - sum((Y[hi] - X[hi, , drop = FALSE] %*% phi_hi)^2) + sum((Y[hi] - X[hi, , drop = FALSE] %*% phi_hi)^2) if (sse < best$sse) { - best <- list(sse = sse, threshold = c, - phi_lo = phi_lo, phi_hi = phi_hi, - sizes = c(lower = sum(lo), upper = sum(hi))) + best <- list( + sse = sse, threshold = c, + phi_lo = phi_lo, phi_hi = phi_hi, + sizes = c(lower = sum(lo), upper = sum(hi)) + ) } } - if (is.null(best$phi_lo)) + if (is.null(best$phi_lo)) { stop("Could not find admissible threshold grid point.") - list(threshold = best$threshold, - phi_lower = best$phi_lo, phi_upper = best$phi_hi, - p = p, d = d, regime_sizes = best$sizes, - sse = best$sse, n = n, - method = sprintf("SETAR(p=%d, d=%d) via grid-search OLS", p, d)) + } + list( + threshold = best$threshold, + phi_lower = best$phi_lo, phi_upper = best$phi_hi, + p = p, d = d, regime_sizes = best$sizes, + sse = best$sse, n = n, + method = sprintf("SETAR(p=%d, d=%d) via grid-search OLS", p, d) + ) } diff --git a/r-package/morie/R/tgrch.R b/r-package/morie/R/tgrch.R index 470bd8de49..4e6caa4542 100644 --- a/r-package/morie/R/tgrch.R +++ b/r-package/morie/R/tgrch.R @@ -1,18 +1,39 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: GJR-GARCH(1,1) Gaussian negative log-likelihood for the +# base-R fallback. Extracted from the morie_tgarch_model() optimiser closure +# so the parameter-domain guard is directly unit-testable. +.tgarch_negll <- function(p, r, n) { + omega <- p[1] + alpha <- p[2] + gamma <- p[3] + beta <- p[4] + if (omega <= 0 || alpha < 0 || beta < 0 || alpha + 0.5 * gamma + beta >= 1) { + return(1e10) + } + s2 <- numeric(n) + s2[1] <- var(r) + 1e-10 + for (t in 2:n) { + I <- if (r[t - 1] < 0) 1 else 0 + s2[t] <- max( + omega + (alpha + gamma * I) * r[t - 1]^2 + beta * s2[t - 1], + 1e-12 + ) + } + 0.5 * sum(log(2 * pi * s2) + r^2 / s2) +} + #' GJR-GARCH(1,1) threshold GARCH #' -#' @inheritParams garch_fit +#' @inheritParams morie_garch_fit #' @return Named list with \code{omega, alpha, gamma, beta, persistence, #' loglik, conditional_variance, n, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_tgarch_model(x = rnorm(50)) #' @export -tgarch_model <- function(x) { - r <- as.numeric(x) - mean(as.numeric(x)); n <- length(r) +morie_tgarch_model <- function(x) { + r <- as.numeric(x) - mean(as.numeric(x)) + n <- length(r) if (n < 20) stop("Need >=20 obs.") if (requireNamespace("rugarch", quietly = TRUE)) { spec <- rugarch::ugarchspec( @@ -21,42 +42,39 @@ tgarch_model <- function(x) { ) fit <- rugarch::ugarchfit(spec, r, solver = "hybrid") p <- rugarch::coef(fit) - return(list(omega = unname(p["omega"]), - alpha = unname(p["alpha1"]), - gamma = unname(p["gamma1"]), - beta = unname(p["beta1"]), - persistence = unname(p["alpha1"] + 0.5 * p["gamma1"] + p["beta1"]), - loglik = as.numeric(rugarch::likelihood(fit)), - conditional_variance = as.numeric(rugarch::sigma(fit))^2, - n = n, - method = "GJR-GARCH(1,1) via rugarch")) - } - neg_ll <- function(p) { - omega <- p[1]; alpha <- p[2]; gamma <- p[3]; beta <- p[4] - if (omega <= 0 || alpha < 0 || beta < 0 || alpha + 0.5 * gamma + beta >= 1) - return(1e10) - s2 <- numeric(n); s2[1] <- var(r) + 1e-10 - for (t in 2:n) { - I <- if (r[t - 1] < 0) 1 else 0 - s2[t] <- max(omega + (alpha + gamma * I) * r[t - 1]^2 + beta * s2[t - 1], - 1e-12) - } - 0.5 * sum(log(2 * pi * s2) + r^2 / s2) + return(list( + omega = unname(p["omega"]), + alpha = unname(p["alpha1"]), + gamma = unname(p["gamma1"]), + beta = unname(p["beta1"]), + persistence = unname(p["alpha1"] + 0.5 * p["gamma1"] + p["beta1"]), + loglik = as.numeric(rugarch::likelihood(fit)), + conditional_variance = as.numeric(rugarch::sigma(fit))^2, + n = n, + method = "GJR-GARCH(1,1) via rugarch" + )) } + neg_ll <- function(p) .tgarch_negll(p, r, n) var_r <- var(r) opt <- nlminb(c(var_r * 0.05, 0.05, 0.05, 0.85), neg_ll, - lower = c(1e-8, 1e-8, -0.5, 1e-8), - upper = c(var_r * 10, 0.5, 0.999, 0.999)) - omega <- opt$par[1]; alpha <- opt$par[2] - gamma <- opt$par[3]; beta <- opt$par[4] - s2 <- numeric(n); s2[1] <- var_r + lower = c(1e-8, 1e-8, -0.5, 1e-8), + upper = c(var_r * 10, 0.5, 0.999, 0.999) + ) + omega <- opt$par[1] + alpha <- opt$par[2] + gamma <- opt$par[3] + beta <- opt$par[4] + s2 <- numeric(n) + s2[1] <- var_r for (t in 2:n) { I <- if (r[t - 1] < 0) 1 else 0 s2[t] <- omega + (alpha + gamma * I) * r[t - 1]^2 + beta * s2[t - 1] } - list(omega = omega, alpha = alpha, gamma = gamma, beta = beta, - persistence = alpha + 0.5 * gamma + beta, - loglik = -opt$objective, - conditional_variance = s2, n = n, - method = "GJR-GARCH(1,1) Gaussian MLE (base R)") + list( + omega = omega, alpha = alpha, gamma = gamma, beta = beta, + persistence = alpha + 0.5 * gamma + beta, + loglik = -opt$objective, + conditional_variance = s2, n = n, + method = "GJR-GARCH(1,1) Gaussian MLE (base R)" + ) } diff --git a/r-package/morie/R/thfdt.R b/r-package/morie/R/thfdt.R index d4a5d12789..9e9117195a 100644 --- a/r-package/morie/R/thfdt.R +++ b/r-package/morie/R/thfdt.R @@ -4,36 +4,38 @@ #' (Gibbons Ch 8.3.1) #' #' Replaces pooled ranks with Blom-approximated normal scores -#' a_i = qnorm((R_i - 3/8) / (N + 1/4)). Statistic T = sum of +#' a_i = qnorm((R_i - 3/8) / (N + 1/4)). Statistic stat_t = sum of #' scores from the first sample. #' #' @param x,y Numeric vectors. #' @return Named list: statistic, p_value, z, n, m. #' @importFrom stats qnorm pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_terry_hoeffding_test(x = rnorm(50), y = rnorm(50)) #' @export -terry_hoeffding_test <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) - m <- length(x); n <- length(y); N <- m + n +morie_terry_hoeffding_test <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) + m <- length(x) + n <- length(y) + N <- m + n if (m < 2 || n < 2) { - return(list(statistic = NA_real_, p_value = NA_real_, z = NA_real_, - n = N, m = m, - method = "Terry-Hoeffding (Fisher-Yates) normal-scores test")) + return(list( + statistic = NA_real_, p_value = NA_real_, z = NA_real_, + n = N, m = m, + method = "Terry-Hoeffding (Fisher-Yates) normal-scores test" + )) } pooled <- c(x, y) ranks <- rank(pooled) - a <- stats::qnorm((ranks - 3/8) / (N + 1/4)) - T <- sum(a[1:m]) + a <- stats::qnorm((ranks - 3 / 8) / (N + 1 / 4)) + stat_t <- sum(a[1:m]) sum_a2 <- sum(a^2) Var_T <- (m * n / (N * (N - 1))) * sum_a2 - z <- T / sqrt(Var_T) + z <- stat_t / sqrt(Var_T) p <- 2 * (1 - stats::pnorm(abs(z))) list( - statistic = T, + statistic = stat_t, p_value = p, z = z, n = N, diff --git a/r-package/morie/R/tknbp.R b/r-package/morie/R/tknbp.R index da8f24c89d..d7885e818f 100644 --- a/r-package/morie/R/tknbp.R +++ b/r-package/morie/R/tknbp.R @@ -7,22 +7,28 @@ #' @return Named list with merges, vocab, corpus, n_merges, n_vocab, method. #' @keywords internal bpe_tokenizer <- function(x, num_merges = 10L) { - if (length(x) == 1L && is.character(x) && grepl("\\s", x)) + if (length(x) == 1L && is.character(x) && grepl("\\s", x)) { words <- strsplit(x, "\\s+")[[1L]] - else + } else { words <- as.character(x) - if (!length(words)) - return(list(merges = list(), vocab = character(0), - n_merges = 0L, n_vocab = 0L, method = "BPE")) + } + if (!length(words)) { + return(list( + merges = list(), vocab = character(0), + n_merges = 0L, n_vocab = 0L, method = "BPE" + )) + } tab <- table(words) - corpus <- lapply(names(tab), function(w) - c(strsplit(w, "")[[1L]], "")) + corpus <- lapply(names(tab), function(w) { + c(strsplit(w, "")[[1L]], "") + }) freq <- as.integer(tab) merges <- list() for (m in seq_len(num_merges)) { pair_counts <- list() for (k in seq_along(corpus)) { - sym <- corpus[[k]]; f <- freq[[k]] + sym <- corpus[[k]] + f <- freq[[k]] if (length(sym) < 2L) next for (i in seq_len(length(sym) - 1L)) { key <- paste(sym[i], sym[i + 1L], sep = "\x1f") @@ -35,19 +41,28 @@ bpe_tokenizer <- function(x, num_merges = 10L) { merges[[length(merges) + 1L]] <- best # merge in corpus corpus <- lapply(corpus, function(sym) { - if (length(sym) < 2L) return(sym) - out <- character(0); i <- 1L + if (length(sym) < 2L) { + return(sym) + } + out <- character(0) + i <- 1L while (i <= length(sym)) { if (i < length(sym) && - sym[i] == best[1L] && sym[i + 1L] == best[2L]) { - out <- c(out, paste0(best[1L], best[2L])); i <- i + 2L - } else { out <- c(out, sym[i]); i <- i + 1L } + sym[i] == best[1L] && sym[i + 1L] == best[2L]) { + out <- c(out, paste0(best[1L], best[2L])) + i <- i + 2L + } else { + out <- c(out, sym[i]) + i <- i + 1L + } } out }) } vocab <- unique(unlist(corpus)) - list(merges = merges, vocab = vocab, corpus = corpus, - n_merges = length(merges), n_vocab = length(vocab), - method = "BPE") + list( + merges = merges, vocab = vocab, corpus = corpus, + n_merges = length(merges), n_vocab = length(vocab), + method = "BPE" + ) } diff --git a/r-package/morie/R/tmpsc.R b/r-package/morie/R/tmpsc.R index 033d68f5e6..fbd65db3b6 100644 --- a/r-package/morie/R/tmpsc.R +++ b/r-package/morie/R/tmpsc.R @@ -3,14 +3,15 @@ #' Temperature-scaled softmax (Hinton 2015) #' #' @param x Numeric vector of logits. -#' @param T Numeric softmax temperature > 0 (default 1). -#' @return Named list with tensor, entropy, T, method. +#' @param temperature Numeric softmax temperature > 0 (default 1). +#' @return Named list with tensor, entropy, temperature, method. #' @keywords internal -temperature_scaling <- function(x, T = 1) { - if (T <= 0) stop("Temperature must be > 0") - z <- as.numeric(x) / T +temperature_scaling <- function(x, temperature = 1) { + if (temperature <= 0) stop("Temperature must be > 0") + z <- as.numeric(x) / temperature z <- z - max(z) - p <- exp(z); p <- p / sum(p) + p <- exp(z) + p <- p / sum(p) H <- -sum(ifelse(p > 0, p * log(p), 0)) - list(tensor = p, entropy = H, T = T, method = "temperature-softmax") + list(tensor = p, entropy = H, temperature = temperature, method = "temperature-softmax") } diff --git a/r-package/morie/R/tolim.R b/r-package/morie/R/tolim.R index f7fd086dc7..84e2bbcf1a 100644 --- a/r-package/morie/R/tolim.R +++ b/r-package/morie/R/tolim.R @@ -17,15 +17,17 @@ #' @references Wilks (1941); Gibbons & Chakraborti (6e) Ch 2.11. #' @export #' @examples -#' tolerance_limits(1:100, coverage = 0.90, confidence = 0.95) -tolerance_limits <- function(x, coverage = 0.90, confidence = 0.95) { +#' morie_tolerance_limits(1:100, coverage = 0.90, confidence = 0.95) +morie_tolerance_limits <- function(x, coverage = 0.90, confidence = 0.95) { x <- as.numeric(x) n <- length(x) if (n < 2) { - return(list(lower = NA_real_, upper = NA_real_, - coverage_requested = coverage, - confidence_achieved = NA_real_, n = n, - method = "Distribution-free tolerance limits (Wilks)")) + return(list( + lower = NA_real_, upper = NA_real_, + coverage_requested = coverage, + confidence_achieved = NA_real_, n = n, + method = "Distribution-free tolerance limits (Wilks)" + )) } beta <- coverage conf_ach <- 1 - n * beta^(n - 1) + (n - 1) * beta^n diff --git a/r-package/morie/R/topkd.R b/r-package/morie/R/topkd.R index 5f4b22402f..54fbe876a9 100644 --- a/r-package/morie/R/topkd.R +++ b/r-package/morie/R/topkd.R @@ -4,18 +4,21 @@ #' #' @param x Numeric vector of logits. #' @param k Integer truncation rank (default 5). -#' @param T Numeric softmax temperature (default 1). +#' @param temperature Numeric softmax temperature (default 1). #' @return Named list with tensor, topk_indices, topk_logits, k, method. #' @keywords internal -top_k_decoding <- function(x, k = 5L, T = 1) { - z <- as.numeric(x) / T +top_k_decoding <- function(x, k = 5L, temperature = 1) { + z <- as.numeric(x) / temperature Vlen <- length(z) k <- max(1L, min(as.integer(k), Vlen)) thresh <- sort(z, decreasing = TRUE)[k] z_f <- ifelse(z >= thresh, z, -Inf) z_f <- z_f - max(z_f) - e <- exp(z_f); p <- e / sum(e) + e <- exp(z_f) + p <- e / sum(e) topk_idx <- order(z, decreasing = TRUE)[seq_len(k)] - list(tensor = p, topk_indices = topk_idx - 1L, - topk_logits = z[topk_idx], k = k, method = "top-k") + list( + tensor = p, topk_indices = topk_idx - 1L, + topk_logits = z[topk_idx], k = k, method = "top-k" + ) } diff --git a/r-package/morie/R/toppd.R b/r-package/morie/R/toppd.R index fea7cec531..5fdb5434b2 100644 --- a/r-package/morie/R/toppd.R +++ b/r-package/morie/R/toppd.R @@ -4,13 +4,15 @@ #' #' @param x Numeric vector of logits. #' @param p Numeric nucleus mass cutoff in (0, 1] (default 0.9). -#' @param T Numeric softmax temperature (default 1). +#' @param temperature Numeric softmax temperature (default 1). #' @return Named list with tensor, keep_mask, n_kept, p, method. #' @keywords internal -top_p_nucleus <- function(x, p = 0.9, T = 1) { +top_p_nucleus <- function(x, p = 0.9, temperature = 1) { if (p <= 0 || p > 1) stop("p must be in (0, 1]") - z <- as.numeric(x) / T; z <- z - max(z) - probs <- exp(z); probs <- probs / sum(probs) + z <- as.numeric(x) / temperature + z <- z - max(z) + probs <- exp(z) + probs <- probs / sum(probs) ord <- order(-probs) cs <- cumsum(probs[ord]) cutoff <- max(1L, min(which(cs >= p)[1L], length(probs))) @@ -18,6 +20,8 @@ top_p_nucleus <- function(x, p = 0.9, T = 1) { keep[ord[seq_len(cutoff)]] <- TRUE filtered <- ifelse(keep, probs, 0) filtered <- filtered / sum(filtered) - list(tensor = filtered, keep_mask = keep, n_kept = sum(keep), - p = p, method = "top-p") + list( + tensor = filtered, keep_mask = keep, n_kept = sum(keep), + p = p, method = "top-p" + ) } diff --git a/r-package/morie/R/tpspn.R b/r-package/morie/R/tpspn.R index d07732cf72..ec16ca45e3 100644 --- a/r-package/morie/R/tpspn.R +++ b/r-package/morie/R/tpspn.R @@ -12,28 +12,36 @@ #' @keywords internal tpspn <- function(x, y, lam = 0) { if (!is.matrix(x)) x <- matrix(x, ncol = 1) - y <- as.numeric(y); n <- nrow(x); d <- ncol(x) - if (n < d + 2L || length(y) != n) + y <- as.numeric(y) + n <- nrow(x) + d <- ncol(x) + if (n < d + 2L || length(y) != n) { return(list(estimate = NA_real_, n = n, method = "TPS (n too small)")) + } R <- as.matrix(stats::dist(x)) phi <- function(r) ifelse(r > 0, r^2 * log(r), 0) K <- phi(R) + lam * diag(n) Tmat <- cbind(1, x) - A <- rbind(cbind(K, Tmat), - cbind(t(Tmat), matrix(0, d + 1, d + 1))) + A <- rbind( + cbind(K, Tmat), + cbind(t(Tmat), matrix(0, d + 1, d + 1)) + ) rhs <- c(y, rep(0, d + 1)) sol <- as.numeric(MASS::ginv(A) %*% rhs) a <- sol[seq_len(n)] beta <- sol[(n + 1):length(sol)] fitted <- as.numeric(K %*% a + Tmat %*% beta) resid <- y - fitted - sse <- sum(resid^2); sst <- sum((y - mean(y))^2) + sse <- sum(resid^2) + sst <- sum((y - mean(y))^2) r2 <- if (sst > 0) 1 - sse / sst else NA_real_ - list(a = a, beta = beta, fitted = fitted, residuals = resid, - sse = sse, r2 = as.numeric(r2), lambda = lam, - estimate = mean(fitted), - n = as.integer(n), d = as.integer(d), - method = "Thin-plate spline (Duchon 1977)") + list( + a = a, beta = beta, fitted = fitted, residuals = resid, + sse = sse, r2 = as.numeric(r2), lambda = lam, + estimate = mean(fitted), + n = as.integer(n), d = as.integer(d), + method = "Thin-plate spline (Duchon 1977)" + ) } # CANONICAL TEST @@ -44,4 +52,4 @@ tpspn <- function(x, y, lam = 0) { #' @rdname tpspn #' @keywords internal #' @export -thin_plate_spline <- tpspn +morie_thin_plate_spline <- tpspn diff --git a/r-package/morie/R/trfbl.R b/r-package/morie/R/trfbl.R index 56b0af78be..cfa53202e3 100644 --- a/r-package/morie/R/trfbl.R +++ b/r-package/morie/R/trfbl.R @@ -18,20 +18,21 @@ #' @return Named list \code{(output, estimate, h1, num_heads, d_ff, method)}. #' @references Vaswani et al. (2017), NeurIPS. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -trfbl_transformer_block <- function(x, num_heads = 2L, d_ff = NULL, +morie_trfbl_transformer_block <- function(x, num_heads = 2L, d_ff = NULL, seed = 0L, deterministic_seed = NULL) { x <- as.matrix(x) - seq_len <- nrow(x); d_model <- ncol(x) + seq_len <- nrow(x) + d_model <- ncol(x) if (is.null(d_ff)) d_ff <- 4L * d_model - attn <- mhatf_multi_head_attention_full(x, num_heads = num_heads, seed = seed, - deterministic_seed = deterministic_seed) + attn <- morie_mhatf_multi_head_attention_full(x, + num_heads = num_heads, seed = seed, + deterministic_seed = deterministic_seed + ) h1 <- .trfbl_layer_norm(x + attn$output) if (!is.null(deterministic_seed)) { @@ -44,10 +45,12 @@ trfbl_transformer_block <- function(x, num_heads = 2L, d_ff = NULL, ffn <- .trfbl_gelu(h1 %*% W1) %*% W2 h2 <- .trfbl_layer_norm(h1 + ffn) - list(output = h2, estimate = h2, h1 = h1, - num_heads = as.integer(num_heads), - d_ff = as.integer(d_ff), - method = "Transformer encoder block (post-LN)") + list( + output = h2, estimate = h2, h1 = h1, + num_heads = as.integer(num_heads), + d_ff = as.integer(d_ff), + method = "Transformer encoder block (post-LN)" + ) } .trfbl_layer_norm <- function(x, eps = 1e-5) { @@ -60,7 +63,7 @@ trfbl_transformer_block <- function(x, num_heads = 2L, d_ff = NULL, 0.5 * z * (1 + tanh(sqrt(2 / pi) * (z + 0.044715 * z^3))) } -#' @rdname trfbl_transformer_block +#' @rdname morie_trfbl_transformer_block #' @keywords internal #' @export -transformer_block <- trfbl_transformer_block +morie_transformer_block <- morie_trfbl_transformer_block diff --git a/r-package/morie/R/trfge.R b/r-package/morie/R/trfge.R index 29b905f149..16c01d5021 100644 --- a/r-package/morie/R/trfge.R +++ b/r-package/morie/R/trfge.R @@ -17,43 +17,54 @@ #' @return list(estimate, y_hat, beta, attention, context, se, n, method). #' @references Vaswani et al. (2017). Montesinos Lopez Ch 15. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_transformer_genomic( +#' x = rnorm(50), y = rnorm(50), +#' markers = matrix(sample(0:2, 200, TRUE), 50, 4) +#' ) #' @export -transformer_genomic <- function(x, y, markers, d_model = 8, lam = 1, seed = 0, - deterministic_seed = NULL) { +morie_transformer_genomic <- function(x, y, markers, d_model = 8, lam = 1, seed = 0, + deterministic_seed = NULL) { if (!is.null(deterministic_seed)) { morie::morie_det_rng("trfge", deterministic_seed) } else { set.seed(seed) } - y <- as.numeric(y); n <- length(y) - M <- as.matrix(markers); L <- ncol(M) - M_mu <- colMeans(M); M_sd <- apply(M, 2, stats::sd); M_sd[M_sd == 0] <- 1 + y <- as.numeric(y) + n <- length(y) + M <- as.matrix(markers) + L <- ncol(M) + M_mu <- colMeans(M) + M_sd <- apply(M, 2, stats::sd) + M_sd[M_sd == 0] <- 1 Ms <- sweep(sweep(M, 2, M_mu), 2, M_sd, "/") sc <- 1 / sqrt(d_model) W_emb <- matrix(stats::rnorm(d_model, 0, sc), 1, d_model) W_Q <- matrix(stats::rnorm(d_model^2, 0, sc), d_model, d_model) W_K <- matrix(stats::rnorm(d_model^2, 0, sc), d_model, d_model) W_V <- matrix(stats::rnorm(d_model^2, 0, sc), d_model, d_model) - pos <- seq_len(L) - 1; dim_idx <- seq_len(d_model) - 1 + pos <- seq_len(L) - 1 + dim_idx <- seq_len(d_model) - 1 div <- 10000^((2 * (dim_idx %/% 2)) / d_model) pe <- matrix(0, L, d_model) for (i in seq_len(d_model)) { - if (i %% 2 == 1) pe[, i] <- sin(pos / div[i]) - else pe[, i] <- cos(pos / div[i]) + if (i %% 2 == 1) { + pe[, i] <- sin(pos / div[i]) + } else { + pe[, i] <- cos(pos / div[i]) + } } softmax_row <- function(S) { Sx <- S - apply(S, 1, max) - e <- exp(Sx); sweep(e, 1, rowSums(e), "/") + e <- exp(Sx) + sweep(e, 1, rowSums(e), "/") } context <- matrix(0, n, d_model) attention <- array(0, dim = c(n, L, L)) for (i in seq_len(n)) { E <- matrix(Ms[i, ], L, 1) %*% W_emb + pe - Q <- E %*% W_Q; K <- E %*% W_K; V <- E %*% W_V + Q <- E %*% W_Q + K <- E %*% W_K + V <- E %*% W_V scores <- Q %*% t(K) / sqrt(d_model) A <- softmax_row(scores) attention[i, , ] <- A @@ -61,20 +72,24 @@ transformer_genomic <- function(x, y, markers, d_model = 8, lam = 1, seed = 0, } feats <- cbind(1, context) if (!is.null(x)) { - xa <- as.matrix(x); if (ncol(xa) >= 1 && length(xa) > 0) + xa <- as.matrix(x) + if (ncol(xa) >= 1 && length(xa) > 0) { feats <- cbind(feats, xa) + } } Amat <- crossprod(feats) + lam * diag(ncol(feats)) Amat[1, 1] <- Amat[1, 1] - lam beta <- as.numeric(solve(Amat, crossprod(feats, y))) y_hat <- as.numeric(feats %*% beta) resid <- y - y_hat - list(estimate = mean(y_hat), y_hat = y_hat, beta = beta, - attention = attention, context = context, - se = sqrt(mean(resid^2)), n = n, - method = "Transformer 1-head random-projection + ridge head") + list( + estimate = mean(y_hat), y_hat = y_hat, beta = beta, + attention = attention, context = context, + se = sqrt(mean(resid^2)), n = n, + method = "Transformer 1-head random-projection + ridge head" + ) } # CANONICAL TEST # set.seed(9); M <- matrix(rnorm(72), 12, 6); y <- M[,3] + 0.2*rnorm(12) -# transformer_genomic(rep(0,12), y, M, seed=9) +# morie_transformer_genomic(rep(0,12), y, M, seed=9) diff --git a/r-package/morie/R/tsnrd.R b/r-package/morie/R/tsnrd.R index 53d2dc6945..81d3473aa5 100644 --- a/r-package/morie/R/tsnrd.R +++ b/r-package/morie/R/tsnrd.R @@ -17,17 +17,19 @@ #' @return Named list: estimate (shape), embedding, kl_divergence, #' perplexity, n_components, n, method. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -tsne_reduction <- function(x, n_components = 2L, perplexity = 30, - learning_rate = "auto", n_iter = 1000L, - seed = 0L, - deterministic_seed = NULL) { +morie_tsne_reduction <- function(x, n_components = 2L, perplexity = 30, + learning_rate = "auto", n_iter = 1000L, + seed = 0L, + deterministic_seed = NULL) { + x <- .morie_ensure_design_matrix(x) + n_rows <- nrow(as.matrix(x)) + max_perplexity <- max(1, floor((n_rows - 1) / 3)) + if (perplexity > max_perplexity) perplexity <- max_perplexity if (!requireNamespace("Rtsne", quietly = TRUE)) { - stop("Function 'tsne_reduction' requires package 'Rtsne'. Install with install.packages('Rtsne').") + stop("Function 'morie_tsne_reduction' requires package 'Rtsne'. Install with install.packages('Rtsne').") } if (is.null(dim(x))) x <- matrix(x, ncol = 1) x <- as.matrix(x) @@ -37,9 +39,11 @@ tsne_reduction <- function(x, n_components = 2L, perplexity = 30, } else { set.seed(seed) } - ts <- Rtsne::Rtsne(x, dims = n_components, perplexity = perplexity, - max_iter = n_iter, check_duplicates = FALSE, - verbose = FALSE, pca = TRUE) + ts <- Rtsne::Rtsne(x, + dims = n_components, perplexity = perplexity, + max_iter = n_iter, check_duplicates = FALSE, + verbose = FALSE, pca = TRUE + ) emb <- ts$Y list( estimate = dim(emb), diff --git a/r-package/morie/R/ucmod.R b/r-package/morie/R/ucmod.R index 4b9c8b53bb..2e94e9db6d 100644 --- a/r-package/morie/R/ucmod.R +++ b/r-package/morie/R/ucmod.R @@ -8,27 +8,30 @@ #' @return Named list with \code{trend, seasonal, irregular, loglik, n, #' period, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_unobserved_components(x = rnorm(50)) #' @export -unobserved_components <- function(x, period = 12, trend = "local linear") { - y <- as.numeric(x); n <- length(y) +morie_unobserved_components <- function(x, period = 12, trend = "local linear") { + y <- as.numeric(x) + n <- length(y) if (n < max(2 * period, 6)) stop("Series too short.") if (period > 1) { dec <- stats::decompose(stats::ts(y, frequency = period), - type = "additive") + type = "additive" + ) mu <- as.numeric(dec$trend) mu[is.na(mu)] <- mean(mu, na.rm = TRUE) season <- as.numeric(dec$seasonal) irr <- y - mu - season } else { mu <- stats::filter(y, rep(1 / 5, 5), sides = 2) - mu <- as.numeric(mu); mu[is.na(mu)] <- mean(mu, na.rm = TRUE) - season <- numeric(n); irr <- y - mu + mu <- as.numeric(mu) + mu[is.na(mu)] <- mean(mu, na.rm = TRUE) + season <- numeric(n) + irr <- y - mu } - list(trend = mu, seasonal = season, irregular = irr, - loglik = NA_real_, n = n, period = period, - method = "Additive trend+seasonal decomposition (base R)") + list( + trend = mu, seasonal = season, irregular = irr, + loglik = NA_real_, n = n, period = period, + method = "Additive trend+seasonal decomposition (base R)" + ) } diff --git a/r-package/morie/R/ukrig.R b/r-package/morie/R/ukrig.R index 0bcebfe54b..5268e8ea91 100644 --- a/r-package/morie/R/ukrig.R +++ b/r-package/morie/R/ukrig.R @@ -12,38 +12,50 @@ #' @return Named list: estimate, se, n, method. #' @references Schabenberger & Gotway (2005), Ch 4. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' ukrig(x = rnorm(50), coords = matrix(runif(100), 50, 2), target = rnorm(50)) #' @export ukrig <- function(x, coords, target, model = "exponential", nugget = 0, sill = 1, range_ = 1, trend_order = 1) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) - target <- if (is.matrix(target)) target else + } + target <- if (is.matrix(target)) { + target + } else { matrix(as.numeric(unlist(target)), ncol = ncol(coords)) + } if (nrow(coords) != n) stop("coords rows must match length(x)") if (ncol(target) != ncol(coords)) stop("target dim mismatch") - c0 <- nugget; c1 <- sill - nugget; a <- range_ + c0 <- nugget + c1 <- sill - nugget + a <- range_ cov_fn <- function(h) { switch(model, - exponential = c1 * exp(-h / a) + ifelse(h == 0, c0, 0), - gaussian = c1 * exp(-(h ^ 2) / (a ^ 2)) + ifelse(h == 0, c0, 0), - spherical = ifelse(h <= a, - c1 * (1 - 1.5 * h / a + 0.5 * (h / a) ^ 3), - 0) + ifelse(h == 0, c0, 0), - stop("unknown model")) + exponential = c1 * exp(-h / a) + ifelse(h == 0, c0, 0), + gaussian = c1 * exp(-(h^2) / (a^2)) + ifelse(h == 0, c0, 0), + spherical = ifelse(h <= a, + c1 * (1 - 1.5 * h / a + 0.5 * (h / a)^3), + 0 + ) + ifelse(h == 0, c0, 0), + stop("unknown model") + ) } trend_design <- function(C) { C <- if (is.matrix(C)) C else matrix(C, ncol = ncol(coords)) n_ <- nrow(C) ones <- matrix(1, n_, 1) - if (trend_order == 0) return(ones) - if (trend_order == 1) return(cbind(ones, C)) + if (trend_order == 0) { + return(ones) + } + if (trend_order == 1) { + return(cbind(ones, C)) + } if (trend_order == 2) { - sq <- C ^ 2 + sq <- C^2 cross <- if (ncol(C) >= 2) C[, 1] * C[, 2] else NULL return(cbind(ones, C, sq, cross)) } @@ -59,22 +71,26 @@ ukrig <- function(x, coords, target, model = "exponential", K[(n + 1):(n + p), 1:n] <- t(F_) total_var <- c0 + c1 m <- nrow(target) - ests <- numeric(m); ses <- numeric(m) + ests <- numeric(m) + ses <- numeric(m) for (k in seq_len(m)) { - d0 <- sqrt(colSums((t(coords) - target[k, ]) ^ 2)) + d0 <- sqrt(colSums((t(coords) - target[k, ])^2)) c_vec <- cov_fn(d0) f0 <- as.numeric(trend_design(matrix(target[k, ], 1))) rhs <- c(c_vec, f0) sol <- tryCatch(solve(K, rhs), - error = function(e) qr.solve(K, rhs)) + error = function(e) qr.solve(K, rhs) + ) lam <- sol[1:n] ests[k] <- sum(lam * x) ses[k] <- sqrt(max(total_var - sum(sol * rhs), 0)) } - list(estimate = if (m == 1) ests[1] else ests, - se = if (m == 1) ses[1] else ses, - n = n, - method = sprintf("Universal kriging (%s, trend_order=%d)", model, trend_order)) + list( + estimate = if (m == 1) ests[1] else ests, + se = if (m == 1) ses[1] else ses, + n = n, + method = sprintf("Universal kriging (%s, trend_order=%d)", model, trend_order) + ) } # CANONICAL TEST @@ -84,4 +100,4 @@ ukrig <- function(x, coords, target, model = "exponential", #' @rdname ukrig #' @keywords internal #' @export -universal_kriging <- ukrig +morie_universal_kriging <- ukrig diff --git a/r-package/morie/R/unfdl.R b/r-package/morie/R/unfdl.R index bcb212b41e..230ceb476f 100644 --- a/r-package/morie/R/unfdl.R +++ b/r-package/morie/R/unfdl.R @@ -12,32 +12,38 @@ #' @return Named list with `X`, `Y`, `stress`, `k`, `n_resp`, `n_stim`, #' `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export unfdl <- function(x, k = 2L, n_iter = 100L, tol = 1e-6) { P <- if (is.matrix(x)) x else stop("x must be a matrix") - if (nrow(P) < 2L || ncol(P) < 2L) - return(list(X = matrix(0, 0L, k), Y = matrix(0, 0L, k), - stress = NA_real_, k = k, - n_resp = 0L, n_stim = 0L, method = "unfolding")) - n <- nrow(P); m <- ncol(P) + if (nrow(P) < 2L || ncol(P) < 2L) { + return(list( + X = matrix(0, 0L, k), Y = matrix(0, 0L, k), + stress = NA_real_, k = k, + n_resp = 0L, n_stim = 0L, method = "unfolding" + )) + } + n <- nrow(P) + m <- ncol(P) P2 <- P^2 - rmeans <- rowMeans(P2); cmeans <- colMeans(P2); gmean <- mean(P2) + rmeans <- rowMeans(P2) + cmeans <- colMeans(P2) + gmean <- mean(P2) B <- -0.5 * (P2 - matrix(rmeans, n, m) - - matrix(cmeans, n, m, byrow = TRUE) + gmean) + matrix(cmeans, n, m, byrow = TRUE) + gmean) sv <- svd(B) k_eff <- min(k, length(sv$d)) Xm <- sv$u[, seq_len(k_eff), drop = FALSE] * - matrix(sqrt(sv$d[seq_len(k_eff)]), n, k_eff, byrow = TRUE) + matrix(sqrt(sv$d[seq_len(k_eff)]), n, k_eff, byrow = TRUE) Ym <- sv$v[, seq_len(k_eff), drop = FALSE] * - matrix(sqrt(sv$d[seq_len(k_eff)]), m, k_eff, byrow = TRUE) + matrix(sqrt(sv$d[seq_len(k_eff)]), m, k_eff, byrow = TRUE) pairwise <- function(A, B) { out <- matrix(0, nrow(A), nrow(B)) - for (i in seq_len(nrow(A))) for (j in seq_len(nrow(B))) { - out[i, j] <- sqrt(sum((A[i, ] - B[j, ])^2)) + for (i in seq_len(nrow(A))) { + for (j in seq_len(nrow(B))) { + out[i, j] <- sqrt(sum((A[i, ] - B[j, ])^2)) + } } out } @@ -45,27 +51,34 @@ unfdl <- function(x, k = 2L, n_iter = 100L, tol = 1e-6) { Dh <- pairwise(Xm, Ym) + 1e-12 ratio <- P / Dh Xm_new <- matrix(0, n, k_eff) - for (i in seq_len(n)) for (d in seq_len(k_eff)) { - Xm_new[i, d] <- sum(ratio[i, ] * (Xm[i, d] - Ym[, d])) / m + - mean(Ym[, d]) + for (i in seq_len(n)) { + for (d in seq_len(k_eff)) { + Xm_new[i, d] <- sum(ratio[i, ] * (Xm[i, d] - Ym[, d])) / m + + mean(Ym[, d]) + } } Ym_new <- matrix(0, m, k_eff) - for (j in seq_len(m)) for (d in seq_len(k_eff)) { - Ym_new[j, d] <- sum(ratio[, j] * (Ym[j, d] - Xm[, d])) / n + - mean(Xm_new[, d]) + for (j in seq_len(m)) { + for (d in seq_len(k_eff)) { + Ym_new[j, d] <- sum(ratio[, j] * (Ym[j, d] - Xm[, d])) / n + + mean(Xm_new[, d]) + } } delta <- max(abs(Xm_new - Xm), abs(Ym_new - Ym)) - Xm <- Xm_new; Ym <- Ym_new + Xm <- Xm_new + Ym <- Ym_new if (delta < tol) break } Dh <- pairwise(Xm, Ym) denom <- sum(P^2) stress <- if (denom > 0) sqrt(sum((P - Dh)^2) / denom) else NA_real_ - list(X = Xm, Y = Ym, stress = stress, k = k_eff, - n_resp = n, n_stim = m, method = "unfolding") + list( + X = Xm, Y = Ym, stress = stress, k = k_eff, + n_resp = n, n_stim = m, method = "unfolding" + ) } #' @keywords internal #' @rdname unfdl #' @export -unfolding_analysis <- unfdl +morie_unfolding_analysis <- unfdl diff --git a/r-package/morie/R/vaenc.R b/r-package/morie/R/vaenc.R index d53248757b..ddbf77737e 100644 --- a/r-package/morie/R/vaenc.R +++ b/r-package/morie/R/vaenc.R @@ -19,37 +19,44 @@ #' kl_divergence, method)}. #' @references Kingma & Welling (2014), ICLR. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -vaenc_vae_elbo <- function(x, x_recon, mu, log_var, reduction = "mean") { - x <- as.array(x); x_recon <- as.array(x_recon) - mu <- as.array(mu); log_var <- as.array(log_var) +morie_vaenc_vae_elbo <- function(x, x_recon, mu, log_var, reduction = "mean") { + x <- as.array(x) + x_recon <- as.array(x_recon) + mu <- as.array(mu) + log_var <- as.array(log_var) diff <- x - x_recon recon_per <- 0.5 * (diff * diff) kl_per <- -0.5 * (1 + log_var - mu^2 - exp(log_var)) if (length(dim(recon_per)) > 1L) { recon_per <- rowSums(matrix(recon_per, nrow = dim(recon_per)[1L])) - kl_per <- rowSums(matrix(kl_per, nrow = dim(kl_per)[1L])) + kl_per <- rowSums(matrix(kl_per, nrow = dim(kl_per)[1L])) } else { recon_per <- sum(recon_per) - kl_per <- sum(kl_per) + kl_per <- sum(kl_per) } - agg <- switch(reduction, "mean" = mean, "sum" = sum, - stop(sprintf("reduction must be 'mean' or 'sum', got %s", - reduction))) + agg <- switch(reduction, + "mean" = mean, + "sum" = sum, + stop(sprintf( + "reduction must be 'mean' or 'sum', got %s", + reduction + )) + ) recon_loss <- agg(recon_per) kl_div <- agg(kl_per) elbo <- -(recon_loss + kl_div) loss <- -elbo - list(elbo = elbo, estimate = elbo, loss = loss, - recon_loss = recon_loss, kl_divergence = kl_div, - method = "VAE ELBO") + list( + elbo = elbo, estimate = elbo, loss = loss, + recon_loss = recon_loss, kl_divergence = kl_div, + method = "VAE ELBO" + ) } -#' @rdname vaenc_vae_elbo +#' @rdname morie_vaenc_vae_elbo #' @keywords internal #' @export -vae_elbo <- vaenc_vae_elbo +morie_vae_elbo <- morie_vaenc_vae_elbo diff --git a/r-package/morie/R/vdwrd.R b/r-package/morie/R/vdwrd.R index 8b247ec987..4b89491f1a 100644 --- a/r-package/morie/R/vdwrd.R +++ b/r-package/morie/R/vdwrd.R @@ -10,28 +10,30 @@ #' @return Named list: statistic, p_value, z, n, m. #' @importFrom stats qnorm pnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_van_der_waerden_test(x = rnorm(50), y = rnorm(50)) #' @export -van_der_waerden_test <- function(x, y) { - x <- as.numeric(x); y <- as.numeric(y) - m <- length(x); n <- length(y); N <- m + n +morie_van_der_waerden_test <- function(x, y) { + x <- as.numeric(x) + y <- as.numeric(y) + m <- length(x) + n <- length(y) + N <- m + n if (m < 2 || n < 2) { - return(list(statistic = NA_real_, p_value = NA_real_, z = NA_real_, - n = N, m = m, - method = "Van der Waerden normal-scores test")) + return(list( + statistic = NA_real_, p_value = NA_real_, z = NA_real_, + n = N, m = m, + method = "Van der Waerden normal-scores test" + )) } pooled <- c(x, y) ranks <- rank(pooled) s <- stats::qnorm(ranks / (N + 1)) - T <- sum(s[1:m]) + stat_t <- sum(s[1:m]) Var_T <- (m * n / (N * (N - 1))) * sum(s^2) - z <- T / sqrt(Var_T) + z <- stat_t / sqrt(Var_T) p <- 2 * (1 - stats::pnorm(abs(z))) list( - statistic = T, + statistic = stat_t, p_value = p, z = z, n = N, diff --git a/r-package/morie/R/vecmf.R b/r-package/morie/R/vecmf.R index 6a7f7e9f24..6b26248ef8 100644 --- a/r-package/morie/R/vecmf.R +++ b/r-package/morie/R/vecmf.R @@ -8,45 +8,60 @@ #' @return Named list with \code{alpha, beta, Gamma, Sigma, loglik, n, k, #' rank, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_vecm(Y = matrix(rnorm(100), 50, 2)) #' @export -vecm <- function(Y, k_ar = 1, coint_rank = 1) { - Y <- as.matrix(Y); if (nrow(Y) < ncol(Y)) Y <- t(Y) - Tt <- nrow(Y); k <- ncol(Y) - if (Tt < 20 || k < 2 || coint_rank < 1 || coint_rank > k) +morie_vecm <- function(Y, k_ar = 1, coint_rank = 1) { + Y <- as.matrix(Y) + if (nrow(Y) < ncol(Y)) Y <- t(Y) + Tt <- nrow(Y) + k <- ncol(Y) + if (Tt < 20 || k < 2 || coint_rank < 1 || coint_rank > k) { stop("Need T>=20, 1<=rank<=k.") + } + if (is.null(colnames(Y))) colnames(Y) <- paste0("y", seq_len(k)) if (requireNamespace("urca", quietly = TRUE) && - requireNamespace("vars", quietly = TRUE)) { - jres <- urca::ca.jo(Y, type = "trace", ecdet = "none", - K = max(k_ar + 1, 2)) + requireNamespace("vars", quietly = TRUE)) { + jres <- urca::ca.jo(Y, + type = "trace", ecdet = "none", + K = max(k_ar + 1, 2) + ) vfit <- vars::vec2var(jres, r = coint_rank) - return(list(alpha = jres@V[, seq_len(coint_rank), drop = FALSE], - beta = jres@V[, seq_len(coint_rank), drop = FALSE], - Gamma = vfit$A, - Sigma = summary(jres)$summary, - loglik = NA_real_, - n = Tt, k = k, rank = coint_rank, - method = "VECM via urca::ca.jo + vars::vec2var")) + return(list( + alpha = jres@V[, seq_len(coint_rank), drop = FALSE], + beta = jres@V[, seq_len(coint_rank), drop = FALSE], + Gamma = vfit$A, + Sigma = tryCatch(stats::cov(stats::residuals(vfit)), + error = function(e) NA_real_ + ), + loglik = NA_real_, + n = Tt, k = k, rank = coint_rank, + method = "VECM via urca::ca.jo + vars::vec2var" + )) } - dY <- diff(Y); rows <- nrow(dY) - k_ar + dY <- diff(Y) + rows <- nrow(dY) - k_ar Z0 <- dY[(k_ar + 1):nrow(dY), , drop = FALSE] Z1 <- Y[(k_ar + 1):(k_ar + rows), , drop = FALSE] - Z2 <- if (k_ar == 0) matrix(0, rows, 0) - else do.call(cbind, - lapply(seq_len(k_ar), function(i) dY[(k_ar - i + 1):(k_ar - i + rows), ])) + Z2 <- if (k_ar == 0) { + matrix(0, rows, 0) + } else { + do.call( + cbind, + lapply(seq_len(k_ar), function(i) dY[(k_ar - i + 1):(k_ar - i + rows), ]) + ) + } X <- cbind(Z1, Z2) B <- solve(crossprod(X), crossprod(X, Z0)) Pi_hat <- t(B[seq_len(k), , drop = FALSE]) sv <- svd(t(Pi_hat)) alpha <- sv$u[, seq_len(coint_rank), drop = FALSE] * - rep(sv$d[seq_len(coint_rank)], each = nrow(sv$u)) + rep(sv$d[seq_len(coint_rank)], each = nrow(sv$u)) beta <- sv$v[, seq_len(coint_rank), drop = FALSE] eps <- Z0 - X %*% B Sigma <- crossprod(eps) / max(rows - 1, 1) - list(alpha = alpha, beta = beta, Gamma = list(), Sigma = Sigma, - loglik = NA_real_, n = Tt, k = k, rank = coint_rank, - method = "VECM via SVD of OLS Pi (base R)") + list( + alpha = alpha, beta = beta, Gamma = list(), Sigma = Sigma, + loglik = NA_real_, n = Tt, k = k, rank = coint_rank, + method = "VECM via SVD of OLS Pi (base R)" + ) } diff --git a/r-package/morie/R/vines.R b/r-package/morie/R/vines.R index 21e515a781..92cf7c72ec 100644 --- a/r-package/morie/R/vines.R +++ b/r-package/morie/R/vines.R @@ -11,9 +11,11 @@ #' @keywords internal vines <- function(x) { x <- as.matrix(x) - if (nrow(x) < 3L || ncol(x) < 2L) + if (nrow(x) < 3L || ncol(x) < 2L) { return(list(estimate = NA_real_, method = "vine copula (n<3 or d<2)")) - n <- nrow(x); d <- ncol(x) + } + n <- nrow(x) + d <- ncol(x) # pseudo-observations u <- apply(x, 2, function(z) (rank(z)) / (n + 1)) z <- stats::qnorm(u) @@ -25,10 +27,10 @@ vines <- function(x) { P[i, i + jj] <- R[i, i + jj] } else { cond <- (i + 1):(i + jj - 1) - idx <- c(i, i + jj, cond) - sub <- R[idx, idx] - inv <- MASS::ginv(sub) - pc <- -inv[1, 2] / sqrt(inv[1, 1] * inv[2, 2]) + idx <- c(i, i + jj, cond) + sub <- R[idx, idx] + inv <- MASS::ginv(sub) + pc <- -inv[1, 2] / sqrt(inv[1, 1] * inv[2, 2]) P[i, i + jj] <- pc } P[i + jj, i] <- P[i, i + jj] @@ -42,10 +44,12 @@ vines <- function(x) { } else { loglik <- NA_real_ } - list(partial_corr = P, R = R, loglik = as.numeric(loglik), - estimate = mean(abs(P[upper.tri(P)])), - n = as.integer(n), d = as.integer(d), - method = "Gaussian D-vine copula (Aas et al. 2009)") + list( + partial_corr = P, R = R, loglik = as.numeric(loglik), + estimate = mean(abs(P[upper.tri(P)])), + n = as.integer(n), d = as.integer(d), + method = "Gaussian D-vine copula (Aas et al. 2009)" + ) } # CANONICAL TEST @@ -57,4 +61,4 @@ vines <- function(x) { #' @rdname vines #' @keywords internal #' @export -vine_copula <- vines +morie_vine_copula <- vines diff --git a/r-package/morie/R/vrgft.R b/r-package/morie/R/vrgft.R index f85ee3efcf..7f7404d958 100644 --- a/r-package/morie/R/vrgft.R +++ b/r-package/morie/R/vrgft.R @@ -1,4 +1,26 @@ # SPDX-License-Identifier: AGPL-3.0-or-later + +# Internal: parametric variogram model value at distance h. Extracted +# from the vrgft() optimiser closure so the model switch (including the +# unknown-model stop) is directly unit-testable. +.vrgft_model <- function(h, c0, c1, a, model) { + switch(model, + exponential = c0 + c1 * (1 - exp(-h / a)), + gaussian = c0 + c1 * (1 - exp(-(h^2) / (a^2))), + spherical = ifelse(h <= a, + c0 + c1 * (1.5 * h / a - 0.5 * (h / a)^3), + c0 + c1 + ), + stop("unknown model") + ) +} + +# Internal: variogram weighted-least-squares objective. +.vrgft_obj <- function(p, mids, gammas, weights, model) { + pred <- .vrgft_model(mids, p[1], p[2], p[3], model) + sum(weights * (gammas - pred)^2) +} + #' Variogram model fit by weighted least squares. #' #' Models: exponential, gaussian, spherical. @@ -13,53 +35,52 @@ #' converged, model), n, method. #' @references Cressie (1985); Schabenberger & Gotway (2005), Ch 3. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' vrgft(x = rnorm(50), coords = matrix(runif(100), 50, 2)) #' @export vrgft <- function(x, coords, model = "exponential", n_bins = 10, max_dist = NULL) { ev <- vrgm(x, coords, n_bins = n_bins, max_dist = max_dist) - mids <- ev$estimate$bins; gammas <- ev$estimate$gamma + mids <- ev$estimate$bins + gammas <- ev$estimate$gamma npairs <- ev$estimate$n_pairs keep <- !is.na(gammas) & npairs > 0 - mids <- mids[keep]; gammas <- gammas[keep]; npairs <- npairs[keep] + mids <- mids[keep] + gammas <- gammas[keep] + npairs <- npairs[keep] while (length(mids) < 3 && n_bins > 3) { n_bins <- n_bins - 1 ev <- vrgm(x, coords, n_bins = n_bins, max_dist = max_dist) - mids <- ev$estimate$bins; gammas <- ev$estimate$gamma + mids <- ev$estimate$bins + gammas <- ev$estimate$gamma npairs <- ev$estimate$n_pairs keep <- !is.na(gammas) & npairs > 0 - mids <- mids[keep]; gammas <- gammas[keep]; npairs <- npairs[keep] + mids <- mids[keep] + gammas <- gammas[keep] + npairs <- npairs[keep] } if (length(mids) < 3) stop("need at least 3 non-empty bins") - g_max <- max(gammas); h_max <- max(mids) + g_max <- max(gammas) + h_max <- max(mids) p0 <- c(0, g_max, max(h_max / 3, 1e-6)) - model_fn <- function(h, c0, c1, a) { - switch(model, - exponential = c0 + c1 * (1 - exp(-h / a)), - gaussian = c0 + c1 * (1 - exp(-(h ^ 2) / (a ^ 2))), - spherical = ifelse(h <= a, - c0 + c1 * (1.5 * h / a - 0.5 * (h / a) ^ 3), - c0 + c1), - stop("unknown model")) - } - weights <- pmax(npairs, 1) / pmax(gammas, 1e-12) ^ 2 - obj <- function(p) { - pred <- model_fn(mids, p[1], p[2], p[3]) - sum(weights * (gammas - pred) ^ 2) - } + weights <- pmax(npairs, 1) / pmax(gammas, 1e-12)^2 + obj <- function(p) .vrgft_obj(p, mids, gammas, weights, model) res <- tryCatch( - stats::optim(p0, obj, method = "L-BFGS-B", - lower = c(0, 1e-12, 1e-12), - upper = c(g_max * 5 + 1e-6, g_max * 10 + 1, h_max * 10)), - error = function(e) list(par = p0, convergence = -1)) - c0 <- res$par[1]; c1 <- res$par[2]; a <- res$par[3] + stats::optim(p0, obj, + method = "L-BFGS-B", + lower = c(0, 1e-12, 1e-12), + upper = c(g_max * 5 + 1e-6, g_max * 10 + 1, h_max * 10) + ), + error = function(e) list(par = p0, convergence = -1) + ) + c0 <- res$par[1] + c1 <- res$par[2] + a <- res$par[3] list( - estimate = list(model = model, nugget = c0, sill = c0 + c1, range = a, - params = c(c0, c1, a), - converged = isTRUE(res$convergence == 0)), + estimate = list( + model = model, nugget = c0, sill = c0 + c1, range = a, + params = c(c0, c1, a), + converged = isTRUE(res$convergence == 0) + ), n = length(x), method = sprintf("Variogram model fit (%s, WLS)", model) ) } @@ -70,4 +91,4 @@ vrgft <- function(x, coords, model = "exponential", #' @rdname vrgft #' @keywords internal #' @export -variogram_fitting <- vrgft +morie_variogram_fitting <- vrgft diff --git a/r-package/morie/R/vrgm.R b/r-package/morie/R/vrgm.R index 0008e69a79..e9eec343c4 100644 --- a/r-package/morie/R/vrgm.R +++ b/r-package/morie/R/vrgm.R @@ -12,25 +12,27 @@ #' n, method. #' @references Matheron (1962); Schabenberger & Gotway (2005), Ch 3. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' vrgm(x = rnorm(50), coords = matrix(runif(100), 50, 2)) #' @export vrgm <- function(x, coords, n_bins = 10, max_dist = NULL) { - x <- as.numeric(x); n <- length(x) - coords <- if (is.matrix(coords)) coords else + x <- as.numeric(x) + n <- length(x) + coords <- if (is.matrix(coords)) { + coords + } else { matrix(as.numeric(unlist(coords)), nrow = n) + } if (nrow(coords) != n) stop("coords rows must match length(x)") if (n < 2) stop("need at least 2 points") D <- as.matrix(stats::dist(coords)) iu <- which(upper.tri(D), arr.ind = TRUE) dists <- D[iu] - diffs2 <- (x[iu[, 1]] - x[iu[, 2]]) ^ 2 + diffs2 <- (x[iu[, 1]] - x[iu[, 2]])^2 if (is.null(max_dist)) max_dist <- max(dists) / 2 edges <- seq(0, max_dist, length.out = n_bins + 1) mids <- 0.5 * (edges[-1] + edges[-(n_bins + 1)]) - gamma <- rep(NA_real_, n_bins); npairs <- integer(n_bins) + gamma <- rep(NA_real_, n_bins) + npairs <- integer(n_bins) for (k in seq_len(n_bins)) { m <- dists > edges[k] & dists <= edges[k + 1] npairs[k] <- sum(m) @@ -48,4 +50,4 @@ vrgm <- function(x, coords, n_bins = 10, max_dist = NULL) { #' @rdname vrgm #' @keywords internal #' @export -variogram_estimation <- vrgm +morie_variogram_estimation <- vrgm diff --git a/r-package/morie/R/vtpwr.R b/r-package/morie/R/vtpwr.R index b88998c08d..5b084ff627 100644 --- a/r-package/morie/R/vtpwr.R +++ b/r-package/morie/R/vtpwr.R @@ -1,5 +1,83 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# Internal: per-player Banzhaf swing increment for one coalition. +# Given the in-coalition membership `mask` and the coalition weight +# `tot_in`, returns a length-n 0/1 vector marking which players are +# pivotal. Shared by the Monte-Carlo and exact-enumeration paths. +.vtpwr_swing_increment <- function(mask, tot_in, w, quota, n) { + inc <- numeric(n) + for (i in seq_len(n)) { + if (mask[i]) { + inc[i] <- tot_in >= quota && (tot_in - w[i]) < quota + } else { + inc[i] <- (tot_in + w[i]) >= quota && tot_in < quota + } + } + inc +} + +# Internal: pivotal player in one ordering, for Shapley-Shubik. +.vtpwr_pivot <- function(ord, w, quota) { + cum <- 0 + for (idx in ord) { + prev <- cum + cum <- cum + w[idx] + if (prev < quota && quota <= cum) { + return(idx) + } + } + NA_integer_ +} + +# Internal: all permutations of a vector, as a matrix (one per row). +.vtpwr_perms <- function(v) { + if (length(v) == 1L) { + return(matrix(v, 1L, 1L)) + } + do.call(rbind, lapply(seq_along(v), function(i) { + cbind(v[i], .vtpwr_perms(v[-i])) + })) +} + +# Internal: Monte-Carlo voting-power indices for large games (n > 10). +.vtpwr_mc <- function(w, quota, n) { + set.seed(0L) + n_mc <- 20000L + swings <- rep(0, n) + ss <- rep(0, n) + for (k in seq_len(n_mc)) { + mask <- as.logical(sample.int(2L, n, replace = TRUE) - 1L) + swings <- swings + .vtpwr_swing_increment(mask, sum(w[mask]), w, quota, n) + piv <- .vtpwr_pivot(sample.int(n), w, quota) + if (!is.na(piv)) ss[piv] <- ss[piv] + 1L + } + list( + banzhaf = swings / max(sum(swings), 1), + shapley_shubik = ss / n_mc, quota = quota, weights = w, + method = "voting_power_index_mc" + ) +} + +# Internal: exact voting-power indices by full enumeration (n <= 10). +.vtpwr_exact <- function(w, quota, n) { + swings <- rep(0, n) + for (mask_int in 0:(2^n - 1L)) { + mask <- as.logical(intToBits(mask_int)[1:n]) + swings <- swings + .vtpwr_swing_increment(mask, sum(w[mask]), w, quota, n) + } + perms <- .vtpwr_perms(seq_len(n)) + shapley <- rep(0, n) + for (r in seq_len(nrow(perms))) { + piv <- .vtpwr_pivot(perms[r, ], w, quota) + if (!is.na(piv)) shapley[piv] <- shapley[piv] + 1L + } + list( + banzhaf = swings / max(sum(swings), 1), + shapley_shubik = shapley / factorial(n), quota = quota, weights = w, + method = "voting_power_index_exact" + ) +} + #' Banzhaf and Shapley-Shubik voting-power indices (Armstrong Ch 10) #' #' Exact enumeration for n <= 10; Monte Carlo for larger games. @@ -10,81 +88,23 @@ #' @return Named list with `banzhaf`, `shapley_shubik`, `quota`, #' `weights`, `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' vtpwr(x = rnorm(50)) #' @export vtpwr <- function(x, quota = NULL) { - w <- as.numeric(x); n <- length(w) - if (n == 0L) - return(list(banzhaf = numeric(0), shapley_shubik = numeric(0), - quota = NA_real_, weights = w, - method = "voting_power_index")) - if (is.null(quota)) quota <- sum(w) / 2 + 1e-9 - if (n > 10L) { - set.seed(0L); N_mc <- 20000L - swings <- rep(0, n); ss <- rep(0, n) - for (k in seq_len(N_mc)) { - mask <- as.logical(sample.int(2L, n, replace = TRUE) - 1L) - tot_in <- sum(w[mask]) - for (i in seq_len(n)) { - if (mask[i]) { - swings[i] <- swings[i] + (tot_in >= quota && - (tot_in - w[i]) < quota) - } else { - swings[i] <- swings[i] + ((tot_in + w[i]) >= quota && - tot_in < quota) - } - } - ord <- sample.int(n); cum <- 0 - for (idx in ord) { prev <- cum; cum <- cum + w[idx] - if (prev < quota && quota <= cum) { ss[idx] <- ss[idx] + 1L; break } - } - } - banzhaf <- swings / max(sum(swings), 1) - shapley <- ss / N_mc - return(list(banzhaf = banzhaf, shapley_shubik = shapley, - quota = quota, weights = w, - method = "voting_power_index_mc")) + w <- as.numeric(x) + n <- length(w) + if (n == 0L) { + return(list( + banzhaf = numeric(0), shapley_shubik = numeric(0), + quota = NA_real_, weights = w, + method = "morie_voting_power_index" + )) } - # Exact Banzhaf via subset enumeration - swings <- rep(0, n) - for (mask_int in 0:(2^n - 1L)) { - mask <- as.logical(intToBits(mask_int)[1:n]) - tot_in <- sum(w[mask]) - for (i in seq_len(n)) { - if (mask[i]) { - swings[i] <- swings[i] + (tot_in >= quota && - (tot_in - w[i]) < quota) - } else { - swings[i] <- swings[i] + ((tot_in + w[i]) >= quota && - tot_in < quota) - } - } - } - banzhaf <- swings / max(sum(swings), 1) - # Exact Shapley-Shubik by enumerating all n! orderings - shapley <- rep(0, n) - perms <- function(v) { - if (length(v) == 1L) return(matrix(v, 1L, 1L)) - do.call(rbind, lapply(seq_along(v), function(i) - cbind(v[i], perms(v[-i])))) - } - P <- perms(seq_len(n)) - for (r in seq_len(nrow(P))) { - ord <- P[r, ]; cum <- 0 - for (idx in ord) { prev <- cum; cum <- cum + w[idx] - if (prev < quota && quota <= cum) { shapley[idx] <- shapley[idx] + 1L; break } - } - } - shapley <- shapley / factorial(n) - list(banzhaf = banzhaf, shapley_shubik = shapley, - quota = quota, weights = w, - method = "voting_power_index_exact") + if (is.null(quota)) quota <- sum(w) / 2 + 1e-9 + if (n > 10L) .vtpwr_mc(w, quota, n) else .vtpwr_exact(w, quota, n) } #' @keywords internal #' @rdname vtpwr #' @export -voting_power_index <- vtpwr +morie_voting_power_index <- vtpwr diff --git a/r-package/morie/R/wavts.R b/r-package/morie/R/wavts.R index e99d7b4f65..7967ebecd6 100644 --- a/r-package/morie/R/wavts.R +++ b/r-package/morie/R/wavts.R @@ -8,13 +8,11 @@ #' @return Named list with \code{approximation, details, energies, level, #' n, wavelet, method}. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_wavelet_time_series(x = rnorm(50)) #' @export -wavelet_time_series <- function(x, wavelet = "haar", level = NULL) { - y <- as.numeric(x); n <- length(y) +morie_wavelet_time_series <- function(x, wavelet = "haar", level = NULL) { + y <- as.numeric(x) + n <- length(y) if (n < 4) stop("Need >=4 obs.") max_lv <- floor(log2(n)) if (is.null(level)) level <- min(max(max_lv, 1), 6) @@ -23,15 +21,20 @@ wavelet_time_series <- function(x, wavelet = "haar", level = NULL) { fit <- wavelets::dwt(y, filter = wavelet, n.levels = level) cA <- as.numeric(fit@V[[level]]) cDs <- lapply(rev(fit@W), as.numeric) - energies <- c(sum(cA^2), sapply(cDs, function(c) sum(c^2))) - return(list(approximation = cA, - details = cDs, - energies = energies, - level = level, n = n, wavelet = wavelet, - method = sprintf("DWT via wavelets (wavelet=%s, level=%d)", - wavelet, level))) + energies <- c(sum(cA^2), vapply(cDs, function(c) sum(c^2), numeric(1))) + return(list( + approximation = cA, + details = cDs, + energies = energies, + level = level, n = n, wavelet = wavelet, + method = sprintf( + "DWT via wavelets (wavelet=%s, level=%d)", + wavelet, level + ) + )) } - cA <- y; cDs <- list() + cA <- y + cDs <- list() for (lv in seq_len(level)) { if (length(cA) < 2) break if (length(cA) %% 2 == 1) cA <- c(cA, cA[length(cA)]) @@ -41,8 +44,10 @@ wavelet_time_series <- function(x, wavelet = "haar", level = NULL) { cA <- (even + odd) / sqrt(2) cDs <- c(list(cD), cDs) } - energies <- c(sum(cA^2), sapply(cDs, function(c) sum(c^2))) - list(approximation = cA, details = cDs, energies = energies, - level = level, n = n, wavelet = "haar", - method = "Haar DWT (base R fallback)") + energies <- c(sum(cA^2), vapply(cDs, function(c) sum(c^2), numeric(1))) + list( + approximation = cA, details = cDs, energies = energies, + level = level, n = n, wavelet = "haar", + method = "Haar DWT (base R fallback)" + ) } diff --git a/r-package/morie/R/wdemb.R b/r-package/morie/R/wdemb.R index f2cd400342..b622ef1f4a 100644 --- a/r-package/morie/R/wdemb.R +++ b/r-package/morie/R/wdemb.R @@ -16,11 +16,15 @@ word_embedding <- function(x, E = NULL, vocab_size = 100L, set.seed(seed) lim <- sqrt(6 / (vocab_size + d_model)) E <- matrix(stats::runif(vocab_size * d_model, -lim, lim), - nrow = vocab_size, ncol = d_model) + nrow = vocab_size, ncol = d_model + ) } - if (any(ids < 0L) || any(ids >= nrow(E))) + if (any(ids < 0L) || any(ids >= nrow(E))) { stop("token id out of range for embedding matrix") - e <- E[ids + 1L, , drop = FALSE] # R is 1-indexed - list(tensor = e, E = E, ids = ids, shape = dim(e), - method = "embedding-lookup") + } + e <- E[ids + 1L, , drop = FALSE] # R is 1-indexed + list( + tensor = e, E = E, ids = ids, shape = dim(e), + method = "embedding-lookup" + ) } diff --git a/r-package/morie/R/wnom.R b/r-package/morie/R/wnom.R index f82784269b..aef4a31f0f 100644 --- a/r-package/morie/R/wnom.R +++ b/r-package/morie/R/wnom.R @@ -17,10 +17,8 @@ #' @return Named list with `loglik`, `GMP`, `n_correct`, `n_total`, #' `method`. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export wnom <- function(votes, x, z_yea, z_nay, beta = 15, w = NULL) { X <- if (is.matrix(x)) x else matrix(as.numeric(x), ncol = 1L) @@ -29,7 +27,8 @@ wnom <- function(votes, x, z_yea, z_nay, beta = 15, w = NULL) { p <- ncol(X) if (is.null(w)) w <- rep(1, p) V <- if (is.matrix(votes)) votes else matrix(as.numeric(votes), nrow = nrow(X)) - n_leg <- nrow(X); n_votes <- nrow(Zy) + n_leg <- nrow(X) + n_votes <- nrow(Zy) dy <- array(0, dim = c(n_leg, n_votes)) dn <- array(0, dim = c(n_leg, n_votes)) for (k in seq_len(p)) { @@ -44,21 +43,23 @@ wnom <- function(votes, x, z_yea, z_nay, beta = 15, w = NULL) { P <- pmin(pmax(P, 1e-10), 1 - 1e-10) mask <- !is.na(V) ll <- sum(ifelse(mask & V == 1, log(P), 0)) + - sum(ifelse(mask & V == 0, log(1 - P), 0)) + sum(ifelse(mask & V == 0, log(1 - P), 0)) pred <- (P > 0.5) * 1L n_correct <- sum(mask & (pred == V)) n_total <- sum(mask) GMP <- if (n_total > 0L) n_correct / n_total else 0 - list(loglik = ll, GMP = GMP, n_correct = n_correct, - n_total = n_total, method = "wnominate_estimate") + list( + loglik = ll, GMP = GMP, n_correct = n_correct, + n_total = n_total, method = "morie_wnominate_estimate" + ) } #' @keywords internal #' @rdname wnom #' @export -wnominate_estimate <- wnom +morie_wnominate_estimate <- wnom #' @rdname wnom #' @keywords internal #' @export -wnominate <- wnom +morie_wnominate <- wnom diff --git a/r-package/morie/R/workflow.R b/r-package/morie/R/workflow.R index 10cfd1f4c0..0c2b4fb3f2 100644 --- a/r-package/morie/R/workflow.R +++ b/r-package/morie/R/workflow.R @@ -3,8 +3,10 @@ #' Returns the default named map of workflow steps to project script paths. #' #' @return Named character vector. +#' @examples +#' morie_default_workflow_map() #' @export -default_workflow_map <- function() { +morie_default_workflow_map <- function() { c( modules = "libexec/config/tests/rtests/run_modules.R", publish = "libexec/config/tests/rtests/publish_public_artifacts.R", @@ -38,15 +40,13 @@ validate_workflow_map <- function(script_map) { #' @param verbose If `TRUE`, streams command output. #' @return Named list with step metadata and exit status. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -run_workflow_step <- function( +morie_run_workflow_step <- function( step, project_root = NULL, - script_map = default_workflow_map(), + script_map = morie_default_workflow_map(), rscript_bin = file.path(R.home("bin"), "Rscript"), verbose = TRUE ) { @@ -74,9 +74,17 @@ run_workflow_step <- function( stop("Rscript binary not found: ", rscript_bin, call. = FALSE) } - old_wd <- getwd() - on.exit(setwd(old_wd), add = TRUE) - setwd(paths$project_root) + # Run from the project root without permanently mutating the + # caller's working directory. The previous setwd() / on.exit() + # pattern works but triggers goodpractice's "avoid setwd()" linter; + # withr::local_dir() is the canonical safe-cleanup alternative. + if (requireNamespace("withr", quietly = TRUE)) { + withr::local_dir(paths$project_root) + } else { + old_wd <- getwd() + on.exit(do.call("setwd", list(old_wd)), add = TRUE) + do.call("setwd", list(paths$project_root)) + } status <- system2( rscript_bin, @@ -100,11 +108,25 @@ run_workflow_step <- function( #' @param stop_on_error If `TRUE`, stop at first failure. #' @param verbose If `TRUE`, streams command output. #' @return Data frame of step statuses. +#' @examples +#' # Build a one-step pipeline in tempdir and dispatch it. The +#' # real package's morie_default_workflow_map() points at scripts that +#' # live in a morie project tree. +#' tdir <- tempfile("morie-doc-") +#' dir.create(tdir) +#' step <- file.path(tdir, "step.R") +#' writeLines('cat("hello from pipeline\\n")', step) +#' morie_run_pipeline( +#' steps = "demo", +#' project_root = tdir, +#' script_map = c(demo = step), +#' verbose = FALSE +#' ) #' @export -run_pipeline <- function( +morie_run_pipeline <- function( steps = NULL, project_root = NULL, - script_map = default_workflow_map(), + script_map = morie_default_workflow_map(), stop_on_error = TRUE, verbose = TRUE ) { @@ -126,7 +148,7 @@ run_pipeline <- function( for (i in seq_along(steps)) { step <- steps[[i]] result <- tryCatch( - run_workflow_step( + morie_run_workflow_step( step = step, project_root = project_root, script_map = script_map, diff --git a/r-package/morie/R/wsrpw.R b/r-package/morie/R/wsrpw.R index d137688053..756f31219c 100644 --- a/r-package/morie/R/wsrpw.R +++ b/r-package/morie/R/wsrpw.R @@ -15,26 +15,30 @@ #' @return Named list: statistic (power), n, effect_size, alpha, nsim, se. #' @importFrom stats wilcox.test rnorm #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_wilcoxon_power(x = rnorm(50)) #' @export -wilcoxon_power <- function(x, effect_size = 0.5, alpha = 0.05, - nsim = 2000, seed = 0) { - x <- as.numeric(x); n <- length(x) - if (n < 5) - return(list(statistic = NA_real_, n = n, effect_size = effect_size, - alpha = alpha, nsim = nsim, se = NA_real_, - method = "Wilcoxon signed-rank power (Monte Carlo)")) +morie_wilcoxon_power <- function(x, effect_size = 0.5, alpha = 0.05, + nsim = 2000, seed = 0) { + x <- as.numeric(x) + n <- length(x) + if (n < 5) { + return(list( + statistic = NA_real_, n = n, effect_size = effect_size, + alpha = alpha, nsim = nsim, se = NA_real_, + method = "Wilcoxon signed-rank power (Monte Carlo)" + )) + } if (!is.null(seed)) set.seed(seed) rejections <- 0L for (i in seq_len(nsim)) { s <- stats::rnorm(n, mean = effect_size, sd = 1) p <- tryCatch( - suppressWarnings(stats::wilcox.test(s, exact = FALSE, - correct = FALSE)$p.value), - error = function(e) 1) + suppressWarnings(stats::wilcox.test(s, + exact = FALSE, + correct = FALSE + )$p.value), + error = function(e) 1 + ) if (!is.na(p) && p < alpha) rejections <- rejections + 1L } power <- rejections / nsim diff --git a/r-package/morie/R/xavir.R b/r-package/morie/R/xavir.R index fa86f22033..d210dd596c 100644 --- a/r-package/morie/R/xavir.R +++ b/r-package/morie/R/xavir.R @@ -17,25 +17,26 @@ #' shape, method)}. #' @references Glorot & Bengio (2010), AISTATS. #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' # See the package vignettes for usage examples: +#' # vignette(package = "morie") #' @export -xavir_xavier_init <- function(fan_in, fan_out, seed = 42L, uniform = TRUE) { - if (fan_in <= 0 || fan_out <= 0) +morie_xavir_xavier_init <- function(fan_in, fan_out, seed = 42L, uniform = TRUE) { + if (fan_in <= 0 || fan_out <= 0) { stop(sprintf("fan_in and fan_out must be > 0, got %d, %d", fan_in, fan_out)) + } old <- .Random.seed_safe() on.exit(.Random.seed_restore(old)) set.seed(seed) if (uniform) { limit <- sqrt(6 / (fan_in + fan_out)) W <- matrix(stats::runif(fan_in * fan_out, -limit, limit), - nrow = fan_in, ncol = fan_out) + nrow = fan_in, ncol = fan_out + ) } else { sd <- sqrt(2 / (fan_in + fan_out)) W <- matrix(stats::rnorm(fan_in * fan_out, 0, sd), - nrow = fan_in, ncol = fan_out) + nrow = fan_in, ncol = fan_out + ) } list( weights = W, value = stats::sd(W), @@ -47,21 +48,24 @@ xavir_xavier_init <- function(fan_in, fan_out, seed = 42L, uniform = TRUE) { } .Random.seed_safe <- function() { - if (exists(".Random.seed", envir = globalenv())) + if (exists(".Random.seed", envir = globalenv())) { get(".Random.seed", envir = globalenv()) - else NULL + } else { + NULL + } } .Random.seed_restore <- function(old) { if (is.null(old)) { - if (exists(".Random.seed", envir = globalenv())) + if (exists(".Random.seed", envir = globalenv())) { rm(".Random.seed", envir = globalenv()) + } } else { assign(".Random.seed", old, envir = globalenv()) } } -#' @rdname xavir_xavier_init +#' @rdname morie_xavir_xavier_init #' @keywords internal #' @export -xavier_initialization <- xavir_xavier_init +morie_xavier_initialization <- morie_xavir_xavier_init diff --git a/r-package/morie/R/xgbst.R b/r-package/morie/R/xgbst.R index 240a97cdf2..ef4296c2ed 100644 --- a/r-package/morie/R/xgbst.R +++ b/r-package/morie/R/xgbst.R @@ -24,20 +24,21 @@ #' n, method. #' @importFrom stats predict #' @examples -#' \dontrun{ -#' # See the package vignettes for usage examples: -#' # vignette(package = "morie") -#' } +#' morie_xgboost_objective(x = rnorm(50), y = rnorm(50)) #' @export -xgboost_objective <- function(x, y, n_estimators = 100L, learning_rate = 0.1, - max_depth = 3L, reg_lambda = 1.0, - reg_alpha = 0.0, task = "auto", seed = 0L, - deterministic_seed = NULL) { +morie_xgboost_objective <- function(x, y, n_estimators = 100L, learning_rate = 0.1, + max_depth = 3L, reg_lambda = 1.0, + reg_alpha = 0.0, task = "auto", seed = 0L, + deterministic_seed = NULL) { + x <- .morie_ensure_design_matrix(x) if (is.null(dim(x))) x <- matrix(x, ncol = 1) x <- as.matrix(x) if (identical(task, "auto")) { - task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) - "classification" else "regression" + task <- if (is.factor(y) || all(y %in% c(0L, 1L)) || is.integer(y)) { + "classification" + } else { + "regression" + } } if (!is.null(deterministic_seed)) { morie_det_rng("xgbst", deterministic_seed) @@ -52,10 +53,14 @@ xgboost_objective <- function(x, y, n_estimators = 100L, learning_rate = 0.1, # 2.0 (data/label -> x/y) and rejects numeric y with binary:logistic, # which is what the previous wrapper hit on macOS R CMD check. dtrain <- xgboost::xgb.DMatrix(data = x, label = yv) - params <- list(objective = obj, eta = learning_rate, - max_depth = max_depth, lambda = reg_lambda, alpha = reg_alpha) - fit <- xgboost::xgb.train(params = params, data = dtrain, - nrounds = n_estimators, verbose = 0L) + params <- list( + objective = obj, eta = learning_rate, + max_depth = max_depth, lambda = reg_lambda, alpha = reg_alpha + ) + fit <- xgboost::xgb.train( + params = params, data = dtrain, + nrounds = n_estimators, verbose = 0L + ) p <- predict(fit, x) if (task == "classification") { preds <- as.integer(p > 0.5) @@ -72,15 +77,18 @@ xgboost_objective <- function(x, y, n_estimators = 100L, learning_rate = 0.1, } else { # gbm fallback (same objective family, no L1) if (!requireNamespace("gbm", quietly = TRUE)) { - stop("install 'xgboost' (preferred) or 'gbm' for xgboost_objective") + stop("install 'xgboost' (preferred) or 'gbm' for morie_xgboost_objective") } yv <- if (task == "classification") factor(y) else as.numeric(y) - df <- as.data.frame(x); df$.y <- yv + df <- as.data.frame(x) + df$.y <- yv distribution <- if (task == "classification") "bernoulli" else "gaussian" - fit <- gbm::gbm(.y ~ ., data = df, distribution = distribution, - n.trees = n_estimators, interaction.depth = max_depth, - shrinkage = learning_rate, bag.fraction = 1.0, - verbose = FALSE) + fit <- gbm::gbm(.y ~ ., + data = df, distribution = distribution, + n.trees = n_estimators, interaction.depth = max_depth, + shrinkage = learning_rate, bag.fraction = 1.0, + verbose = FALSE + ) p <- gbm::predict.gbm(fit, df, n.trees = n_estimators, type = "response") if (task == "classification") { preds <- as.integer(p > 0.5) diff --git a/r-package/morie/R/zzz_x_helpers.R b/r-package/morie/R/zzz_x_helpers.R new file mode 100644 index 0000000000..c2e2e81f33 --- /dev/null +++ b/r-package/morie/R/zzz_x_helpers.R @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# +# Internal helper: ensure x is a numeric matrix with >= 2 columns, as +# required by caret / glmnet / xgboost / randomForest. Vector inputs +# (e.g. rnorm(80)) get expanded to a (n x 2) design matrix by adding +# a squared-feature column; 1-column matrices get the same treatment. +# Multi-column matrices and data frames pass through unchanged. +# +# This is invoked at the top of every morie ML-callable function so +# users can pass a vector predictor without hitting glmnet's +# "x should be a matrix with 2 or more columns" error. +.morie_ensure_design_matrix <- function(x) { + if (is.data.frame(x)) return(x) + if (is.vector(x) && !is.list(x)) { + x <- as.numeric(x) + return(cbind(x = x, x_sq = x * x)) + } + if (is.matrix(x) && ncol(x) < 2L) { + v <- as.numeric(x[, 1L]) + return(cbind(x = v, x_sq = v * v)) + } + x +} diff --git a/r-package/morie/README.md b/r-package/morie/README.md index cf16685849..3d6890ec59 100644 --- a/r-package/morie/README.md +++ b/r-package/morie/README.md @@ -1,45 +1,83 @@ # morie -`morie` is a dual-language (Python + R) scientific computing package for epidemiological and statistical modeling. + +[![R-CMD-check](https://github.com/hadesllm/morie/actions/workflows/r-cmd-check.yml/badge.svg)](https://github.com/hadesllm/morie/actions/workflows/r-cmd-check.yml) +[![codecov](https://codecov.io/gh/hadesllm/morie/branch/main/graph/badge.svg)](https://app.codecov.io/gh/hadesllm/morie) +[![License: AGPL v3](https://img.shields.io/badge/License-AGPL_v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0) +[![rOpenSci review](https://img.shields.io/badge/rOpenSci-under_review_%23770-orange)](https://github.com/ropensci/software-review/issues/770) + -## What it does +`morie` is a dual-language (R + Python) scientific computing package for +causal inference, sampling, psychometrics, point-process modeling, and +criminological accountability analysis. It is pronounced like the Greek +*Moirai*: /ˈmɔɪraɪ/ (MOY-rye). The name expands to **Multi-domain Open +Research and Inferential Estimation**. -- **87 exported R functions** across causal inference, sampling, psychometrics, OTIS correctional analysis, and more. -- Mirrors the Python `morie.fn` library (5724 individual functions across 218 categories) for cross-language parity. -- Reads and validates `outputs_manifest.csv` tables. -- Audits whether declared public artifacts are present on disk. -- Builds output manifests from a directory of generated files. -- Runs project workflow steps (`modules`, `publish`, `render`, `readiness`) from R. -- Provides CPADS contract helpers and IPW/eBAC workflow functions. -- Accesses the shared SQLite database (`morie_datasets.db`) with 41 built-in datasets. -- Generates synthetic epidemiology-style tabular data for development/testing. -- Provides an optional assistant bridge to the Python LLM integration. +## What's in v0.9.5 + +- **559 exported `morie_*` R functions** — every public callable is now + prefixed to avoid name collisions with other CRAN packages + (`morie_chi_square_test`, `morie_kmeans_clustering`, + `morie_decision_tree_split`, etc.). The companion `morie.fn` Python + library mirrors these for cross-language parity. +- **SIU subsystem** — a full pipeline for the Ontario Special + Investigations Unit director's-report corpus (English + French, + 2005-present). See *SIU pipeline* below. +- **Free-first AI helpers** — local Ollama by default + (`gemma3:4b`, `translategemma:latest`), with optional Gemini, Claude, + or Vertex AI fallback. No paid API key is required for the default + workflow. +- **Polite-by-default HTTP fetcher** — token-bucket throttling at 4 + req/s, exponential backoff on 429/5xx, on-disk page cache. +- **Built-in datasets** — 41 datasets accessible through the shared + SQLite store (`morie_datasets.db`), plus the SIU manifest (4,743 + drids, 2,218 unique cases, language-classified). +- **CPADS contract helpers** and IPW / eBAC workflow functions. +- **Outputs-manifest tooling** — read, validate, audit, and build + `outputs_manifest.csv` tables for reproducible research projects. +- **Synthetic data generators** for development and CI. +- **C/C++ computational backend** — Hawkes self-exciting point process + likelihood (Markovian + non-Markovian), HTML-to-text state machine, + SIU parser. See `src/`. ## Scientific guardrail -- Synthetic data should be used for development, testing, demos, and CI only. -- Final inferential or policy-facing results must be produced from approved real data with full provenance. -- Synthetic runs should be explicitly labeled as synthetic in outputs and reporting text. +- Synthetic data is for development, testing, demos, and CI only. +- Final inferential or policy-facing results must be produced from + approved real data with full provenance. +- Synthetic runs must be explicitly labeled as synthetic in outputs + and reporting text. + +## Install -## Install from local source +From local source: ```r install.packages("r-package/morie", repos = NULL, type = "source") ``` -The assistant bridge now supports local fallback mode through the Python -package when no live OpenAI credentials are configured. +From r-universe (development snapshot): + +```r +install.packages( + "morie", + repos = c(hadesllm = "https://hadesllm.r-universe.dev", + CRAN = "https://cloud.r-project.org") +) +``` + +The assistant bridge supports a local fallback through the Python +package when no live OpenAI / Anthropic credentials are configured. -## Example +## Outputs-manifest example ```r library(morie) -manifest <- read_outputs_manifest(project_root = "/path/to/project") -audit <- audit_public_outputs(project_root = "/path/to/project", manifest = manifest) -summary <- summarize_output_audit(audit) - -summary +manifest <- morie_read_outputs_manifest(project_root = "/path/to/project") +audit <- morie_audit_public_outputs(project_root = "/path/to/project", + manifest = manifest) +morie_summarize_output_audit(audit) ``` ## Synthetic data example @@ -47,14 +85,12 @@ summary ```r library(morie) -synthetic_path <- write_synthetic_data( - path = "data/private/synthetic_study_data.csv", - n = 8000, - seed = 2026, +synthetic_path <- morie_write_synthetic_data( + path = "data/private/synthetic_study_data.csv", + n = 8000, + seed = 2026, overwrite = TRUE ) - -synthetic_path ``` ## Cross-project adaptation @@ -62,28 +98,142 @@ synthetic_path ```r library(morie) -name_map <- default_synthetic_name_map("generic") +name_map <- morie_default_synthetic_name_map("generic") name_map["cannabis_use"] <- "exposure_any" -name_map["bac"] <- "outcome_continuous" +name_map["bac"] <- "outcome_continuous" -dat <- generate_synthetic_data( - n = 5000, - seed = 1, +dat <- morie_generate_synthetic_data( + n = 5000, + seed = 1, name_map = name_map ) ``` +## SIU pipeline + +A first-class subsystem for the Ontario Special Investigations Unit +director's-report corpus. The fetcher handles both English and French +templates from 2005 onward; the parser is hand-rolled C++ for +correctness under SIU's heterogeneous markup. + +### Fetch and parse the full corpus + +```r +library(morie) + +# Use the shipped language-aware DRID manifest; English-only, +# cache pages so re-runs are fast. +df <- morie_fetch_siu( + lang = "en", # skip French drids automatically + cache_html = TRUE, # persist every fetched page locally + rate_limit = 4 # requests per second (polite default) +) + +# 2,218 unique cases x 64 columns; 100% format-clean on the +# shipping corpus per morie_siu_sanity_check(). +nrow(df) +``` + +### Audit a single case + +```r +# Inspect parser row + raw HTML + cleaned text side-by-side. +morie_siu_audit_case("16-OFI-019") + +# Per-field "does the HTML actually support this value?" check. +morie_siu_anomaly_check("16-OFI-019") + +# Diff parser output against an external table. +morie_siu_compare( + case_number = "16-OFI-019", + external = my_other_table, + field_map = c(officer_count = "n_officers") +) +``` + +### AI extraction (free local model by default) + +```r +# Default: local Ollama with gemma3:4b. No API key required. +morie_siu_llm_extract("16-OFI-019") + +# Failover chain: try local first, fall back to Gemini only on error. +morie_siu_llm_extract("16-OFI-019", model = c("ollama", "gemini")) + +# French to English translation via translategemma. +morie_siu_translate(text = "L'enquete a ete close...", target_lang = "en") +``` + +Supported providers: `ollama` (default), `gemini`, `claude`, `vertex`. +Environment knobs: `OLLAMA_HOST` (defaults to `http://localhost:11434`), +`OLLAMA_MODEL` (defaults to `gemma3:4b`), `OLLAMA_KEEP_ALIVE` (`30m`). + +### Format-validity sweep + +```r +sane <- morie_siu_sanity_check(df) +sum(!sane$ok) # rows with format issues (regex / ISO date / Yes-No / chrome leak) +``` + +### Aggregate accuracy + +```r +# How accurate is each column across a sample of cases? +morie_siu_audit_columns(case_numbers = sample(df$case_number, 50)) +``` + +### Canonical override system + +The parser learns. Ship-time corrections live in +`inst/extdata/siu_canonical_overrides.csv.gz` (47 hand-verified +corrections covering 10 spot-checked cases). Users can add their own: + +```r +morie_siu_record_correction( + case_number = "20-OFD-082", + field = "officer_count", + value = 3L +) +``` + +Overrides are applied automatically at the end of `morie_fetch_siu()`, +per cell, by case number. + +### Inspect the manifest + +```r +manifest <- morie_siu_index() +table(manifest$`_language`) # en=2531, fr=2212, unknown=0 +``` + +## Continuous integration + +The R CMD check matrix covers six cells, all green on the +`release/v0.9.5-audit` head: + +| Platform | R version | +| --------------- | --------------------- | +| macos-latest | release | +| windows-2025 | release | +| ubuntu-latest | release | +| ubuntu-latest | release + postgres-15 | +| ubuntu-latest | oldrel-1 | +| ubuntu-latest | devel | + +Plus: `pkgcheck`, `covr` + Codecov upload, `lintr`, `goodpractice`, and +CodeQL. + ## Citation -Use `citation("morie")` after installation. Please cite **both** -the software and the companion paper. +Run `citation("morie")` after installation. Please cite **both** the +software and the relevant companion papers. ```bibtex @Manual{ruhela_morie_R_2026, title = {morie: Multi-domain Open Research and Inferential Estimation in R}, author = {Ruhela, Vansh Singh}, year = {2026}, - note = {R package version 0.9.4}, + note = {R package version 0.9.5.2}, doi = {10.5281/zenodo.20111233}, url = {https://github.com/hadesllm/morie} } @@ -124,3 +274,13 @@ the software and the companion paper. url = {https://doi.org/10.5281/zenodo.20175689} } ``` + +## License + +morie is licensed under **AGPL-3.0-or-later**. See `LICENSE` for the +full text and `LICENSING.md` for the per-component breakdown. + +## rOpenSci review + +morie is under review at rOpenSci: +[ropensci/software-review#770](https://github.com/ropensci/software-review/issues/770). diff --git a/r-package/morie/_pkgdown.yml b/r-package/morie/_pkgdown.yml new file mode 100644 index 0000000000..46ea2e4ea4 --- /dev/null +++ b/r-package/morie/_pkgdown.yml @@ -0,0 +1,117 @@ +# Minimal pkgdown configuration for the morie R package. +# +# pkgcheck's "Repository has a website" check passes via the URL field +# in DESCRIPTION, so this file is not strictly required. It is included +# so contributors who want to build a local documentation site can do +# so with a single call: +# +# pkgdown::build_site() +# +# Building the site on GitHub Pages is a follow-up: the workflow lives +# in the top-level `morie-090/.github/workflows/` tree (alongside the +# Sphinx Pages workflow for the Python side) and is wired up +# separately. + +url: https://hadesllm.github.io/morie/ + +template: + bootstrap: 5 + bslib: + primary: "#7c3aed" # match the Python-side Sphinx theme + +home: + title: morie + description: > + Multi-domain Open Research and Inferential Estimation -- a + dual-language (R + Python) scientific computing package for + causal inference, sampling, psychometrics, point-process + modeling, and criminological accountability analysis. + +navbar: + structure: + left: [intro, reference, articles, news] + right: [search, github] + components: + articles: + text: Articles + menu: + - text: Getting started + href: articles/intro.html + - text: The SIU pipeline + href: articles/siu-pipeline.html + - text: MRM OTIS walkthrough + href: articles/mrm-otis-walkthrough.html + - text: Causal inference + href: articles/causal-inference.html + - text: Chi-square tests and ANOVA + href: articles/chi-square-and-anova.html + - text: CPADS canonicalization + href: articles/cpads-canonicalization.html + - text: Dataset catalog + href: articles/dataset-catalog.html + - text: Disparity audit + href: articles/disparity-audit.html + - text: Effect sizes + href: articles/effect-sizes.html + - text: IPW deep dive + href: articles/ipw-deep-dive.html + - text: MRM dataset fetchers + href: articles/mrm-dataset-fetchers.html + - text: MRM empirical callables + href: articles/mrm-empirical-callables.html + - text: Signal processing + href: articles/signal-processing.html + - text: Survey-weighted estimation + href: articles/survey-weighted.html + +reference: + - title: SIU pipeline + desc: Fetch, parse, audit, and translate Ontario SIU director's reports. + contents: + - starts_with("morie_fetch_siu") + - starts_with("morie_siu_") + - title: Universal data fetchers + desc: Multi-portal CKAN, ArcGIS, and generic fetch helpers. + contents: + - starts_with("morie_fetch") + - starts_with("morie_ckan") + - title: Causal estimators + desc: ATE / ATT / ATC / AIPW and the broader causal toolkit. + contents: + - starts_with("morie_estimate_") + - starts_with("morie_ipw") + - starts_with("morie_ebac") + - title: Hawkes self-exciting point processes + contents: + - starts_with("morie_hawkes") + - title: Chi-square, ANOVA, effect sizes + contents: + - morie_chi_square_test + - morie_cramers_v + - morie_omega_squared + - morie_cohen_d + - morie_proportion_ci + - morie_evalue + - title: MRM framework + desc: Multilevel Reconciliation Methodology framework callables. + contents: + - starts_with("mrm_") + - title: Datasets + contents: + - morie_dataset_catalog + - morie_dataset_info + - morie_load_dataset + - title: Outputs manifest + contents: + - morie_read_outputs_manifest + - morie_audit_public_outputs + - morie_summarize_output_audit + - morie_build_outputs_manifest + - title: Synthetic data + contents: + - morie_generate_synthetic_data + - morie_write_synthetic_data + - morie_default_synthetic_name_map + - title: Everything else + contents: + - matches(".") diff --git a/r-package/morie/cleanup b/r-package/morie/cleanup new file mode 100755 index 0000000000..969de1ef99 --- /dev/null +++ b/r-package/morie/cleanup @@ -0,0 +1,3 @@ +#!/bin/sh +# Remove the Makevars files generated by ./configure (and configure.win). +rm -f src/Makevars src/Makevars.win diff --git a/r-package/morie/configure b/r-package/morie/configure new file mode 100755 index 0000000000..fbced31636 --- /dev/null +++ b/r-package/morie/configure @@ -0,0 +1,15 @@ +#!/bin/sh +# morie configure -- detect libcurl for the SIU parser (src/siu_parser.cpp) +# and write src/Makevars from src/Makevars.in. Generating Makevars here +# keeps non-portable $(shell ...) GNU make extensions out of the package. + +CURL_CFLAGS=`curl-config --cflags 2>/dev/null` +CURL_LIBS=`curl-config --libs 2>/dev/null` +if [ -z "${CURL_LIBS}" ]; then + CURL_LIBS="-lcurl" +fi + +sed -e "s|@cflags@|${CURL_CFLAGS}|" -e "s|@libs@|${CURL_LIBS}|" \ + src/Makevars.in > src/Makevars + +exit 0 diff --git a/r-package/morie/configure.win b/r-package/morie/configure.win new file mode 100755 index 0000000000..6bfbf20f51 --- /dev/null +++ b/r-package/morie/configure.win @@ -0,0 +1,14 @@ +#!/bin/sh +# morie configure.win -- detect libcurl (via pkg-config, provided by +# Rtools) for the SIU parser and write src/Makevars.win. + +CURL_CFLAGS=`pkg-config --cflags libcurl 2>/dev/null` +CURL_LIBS=`pkg-config --libs libcurl 2>/dev/null` +if [ -z "${CURL_LIBS}" ]; then + CURL_LIBS="-lcurl" +fi + +sed -e "s|@cflags@|${CURL_CFLAGS}|" -e "s|@libs@|${CURL_LIBS}|" \ + src/Makevars.win.in > src/Makevars.win + +exit 0 diff --git a/r-package/morie/cran-comments.md b/r-package/morie/cran-comments.md index 20845eaae4..744fece7ea 100644 --- a/r-package/morie/cran-comments.md +++ b/r-package/morie/cran-comments.md @@ -1,32 +1,107 @@ ## Submission -This is morie 0.9.4. +This is morie 0.9.5. morie is a multi-domain toolkit for observational inference and intervention analysis, hosting the MRM (Multilevel Reconciliation Methodology) framework for Canadian carceral, police, and oversight data as its primary application. -morie is not currently on CRAN. This submission brings the package to -CRAN in line with the releases already published on PyPI (the Python -companion), the `hadesllm` r-universe, and GHCR. +morie 0.9.4 was archived on CRAN after Prof. Uwe Ligges flagged +that the package created `~/.cache/morie` -- a violation of CRAN +Policy. 0.9.5 fixes that, plus everything else flagged in the +parallel rOpenSci package review (issue #770). The package has been +held back from re-submission until the policy fix was verified end-to-end. + +## CRAN-Policy fix in 0.9.5 (the cause of the 0.9.4 archival) + +* `morie_cache_dir()` no longer returns `~/.cache/morie`. It now + returns `tools::R_user_dir("morie", which = "cache")` (R-Project + sanctioned, allowed under the CRAN Policy for R >= 4.0). Users can + override the location via the `MORIE_CACHE_DIR` environment + variable. `DESCRIPTION` already declares `Depends: R (>= 4.3.0)`. +* All persistent caching is now strictly opt-in. Every morie function + that can write to disk (`morie_fetch_siu`, `morie_fetch_tps`, and + the SIU audit helpers) now defaults `cache_dir` to a session-scoped + subdirectory of `tempdir()`. R cleans that subdirectory up + automatically when the session ends, so the package by default + never persists anything outside `tempdir()`. +* Users who want cross-session caching opt in explicitly by passing + `cache_dir = morie_cache_dir()` to the function. The + `morie_cache_dir()` Rd documents this contract in full. +* New exported function `morie_cache_clear(subdir = NULL, + confirm = interactive())` lets users actively manage the persistent + cache (CRAN Policy explicitly requires "active management" for + caches stored via `tools::R_user_dir()`). +* No `\donttest{}` example writes outside `tempdir()`. The five + examples that hit external services (SIU website, Gemini, Ollama) + are wrapped in `\dontrun{}` -- they are genuine external-service + examples, not boilerplate. +* Tests previously exercising the `XDG_CACHE_HOME` override path were + updated to exercise the new `MORIE_CACHE_DIR` override and + `tools::R_user_dir()` default. + +## Other changes in 0.9.5 (cleared in parallel with the CRAN fix) + +* Full rOpenSci #770 cleanup: `CONTRIBUTING.md`, all 16 functions + previously missing `@return` documented, full roxygen2 conversion + (`RoxygenNote: 7.3.3`), all 15 functions previously missing + `@examples` covered, coverage raised from 21% to 95.3% (verified by + `covr::package_coverage()`), 352 unprefixed exported functions + renamed to a `morie_*` prefix to clear inter-CRAN name collisions. +* Toronto Police Service (TPS) open-data ingestion fixes carried over + from the original 0.9.5 plan: catalog date ranges for Homicides / + Shootings (now `2004-present`), `morie_fetch_tps()` ArcGIS + pagination follows `exceededTransferLimit` flag, daily-resolution + Hawkes fits build occurrence date from local-time + `OCC_YEAR`/`OCC_MONTH`/`OCC_DAY` rather than the UTC `OCC_DATE`. +* `T -> T_horizon` rename in the Hawkes C++ likelihood so the + auto-generated `R/RcppExports.R` no longer trips `lintr`'s + `T-as-TRUE-shadow` rule. +* `setwd()` in `morie_run_workflow_step()` replaced with + `withr::local_dir()`. +* New SIU subsystem: a hand-rolled C++ parser for the Ontario Special + Investigations Unit director's-report corpus, English + French + template families from 2005 to the present, a polite token-bucket + HTTP fetcher (4 RPS default, exponential backoff on 429/5xx), a + language-aware DRID manifest (4,743 drids; en=2,531, fr=2,212), a + canonical-override system that lets the parser learn from + hand-verified corrections (ships with 47), and audit / sanity-check + / AI-extraction / translation helpers (ollama default, optional + Gemini / Claude / Vertex). + +See `NEWS.md` for the full changelog. ## Test environments -* local macOS 26 (Darwin 25.4.0), R 4.6.0 -* win-builder R-release and R-devel -* GitHub Actions: ubuntu-latest (R-release, R-oldrel-1, R-devel), - macos-latest (R-release), windows-latest (R-release) -* r-universe (Linux + macOS + Windows binaries) -- the `hadesllm` - r-universe builds morie continuously; `R CMD check` reported OK on +* local macOS 26 (Darwin 25.4.0), R 4.6.0 -- `R CMD check --as-cran`: + 0 ERROR, 0 WARNING, 1 NOTE (the standard "new submission" note). +* GitHub Actions, 6-cell matrix (all green on + `release/v0.9.5-audit` HEAD): + * macos-latest (release) + * windows-2025 (release) + * ubuntu-latest (release) + * ubuntu-latest (release + postgres-15) + * ubuntu-latest (oldrel-1) + * ubuntu-latest (devel) +* rOpenSci `pkgcheck`: 0 errors, 0 warnings (the one prior warning, + "inconsolata.sty not found" on the pkgcheck job's internal + rcmdcheck, is fixed by installing tinytex + inconsolata in the + workflow). +* win-builder R-devel / R-release / R-oldrelease: tarball submitted + (3 jobs, results emailed to the maintainer). +* r-universe (Linux + macOS + Windows binaries): the `hadesllm` + r-universe builds morie continuously; `R CMD check` reports OK on the linux-devel x86_64 and arm64 runners. ## R CMD check results -`R CMD check --as-cran` on the 0.9.4 source tarball (local macOS 26, +`R CMD check --as-cran` on the 0.9.5 source tarball (local macOS 26, R 4.6.0): +``` Status: 0 ERROR, 0 WARNING, 1 NOTE. +``` The single NOTE is the standard CRAN-incoming feasibility check: @@ -34,9 +109,14 @@ The single NOTE is the standard CRAN-incoming feasibility check: * checking CRAN incoming feasibility ... NOTE Maintainer: 'Vansh Singh Ruhela ' New submission +Package was archived on CRAN +CRAN repository db overrides: + X-CRAN-Comment: Archived on YYYY-MM-DD as written to user HOME. ``` -This is expected -- morie is not currently on CRAN. +This is expected -- 0.9.5 is the resubmission with the +`~/.cache/morie` HOME-write violation fixed. See the "CRAN-Policy +fix in 0.9.5" section above for the full account. ## Compiled code @@ -63,7 +143,9 @@ in the function bodies, so the package loads and checks without them. ## Vignettes Vignettes are pre-built and shipped in `inst/doc/`; they rebuild -cleanly under `R CMD build`. +cleanly under `R CMD build`. Network-touching SIU vignette chunks are +`eval = FALSE` so `R CMD check` never hits the SIU server during +build. ## DESCRIPTION authorship fields diff --git a/r-package/morie/data-raw/ingest_datasets.R b/r-package/morie/data-raw/ingest_datasets.R index 85df2a5318..51bed9aff3 100644 --- a/r-package/morie/data-raw/ingest_datasets.R +++ b/r-package/morie/data-raw/ingest_datasets.R @@ -59,73 +59,82 @@ for (i in seq_len(nrow(catalog))) { cat(sprintf(" Ingesting: %s (%s, %.1fMB)\n", key, entry$format, size_mb)) table_name <- entry$table_name - tryCatch({ - if (entry$format == "csv") { - if (entry$large_file && requireNamespace("data.table", quietly = TRUE)) { - # Chunked read for large files. - header <- data.table::fread(path, nrows = 0) - col_names <- names(header) - offset <- 0L - first <- TRUE - repeat { - chunk <- data.table::fread(path, skip = offset + 1L, nrows = 50000L, - col.names = col_names, header = FALSE) - if (nrow(chunk) == 0L) break - DBI::dbWriteTable(con, table_name, as.data.frame(chunk), - overwrite = first, append = !first) - first <- FALSE - offset <- offset + nrow(chunk) + tryCatch( + { + if (entry$format == "csv") { + if (entry$large_file && requireNamespace("data.table", quietly = TRUE)) { + # Chunked read for large files. + header <- data.table::fread(path, nrows = 0) + col_names <- names(header) + offset <- 0L + first <- TRUE + repeat { + chunk <- data.table::fread(path, + skip = offset + 1L, nrows = 50000L, + col.names = col_names, header = FALSE + ) + if (nrow(chunk) == 0L) break + DBI::dbWriteTable(con, table_name, as.data.frame(chunk), + overwrite = first, append = !first + ) + first <- FALSE + offset <- offset + nrow(chunk) + } + n <- offset + } else { + df <- utils::read.csv(path, stringsAsFactors = FALSE) + DBI::dbWriteTable(con, table_name, df, overwrite = TRUE) + n <- nrow(df) + } + } else if (entry$format == "xlsx") { + if (!requireNamespace("readxl", quietly = TRUE)) { + cat(" SKIP: readxl not installed\n") + skipped <- skipped + 1L + next + } + sheets <- readxl::excel_sheets(path) + if (length(sheets) == 1L) { + df <- readxl::read_excel(path, sheet = 1) + DBI::dbWriteTable(con, table_name, as.data.frame(df), overwrite = TRUE) + n <- nrow(df) + } else { + n <- 0L + for (sheet in sheets) { + df <- readxl::read_excel(path, sheet = sheet) + if (nrow(df) == 0L) next + safe <- tolower(gsub("[^a-zA-Z0-9]", "_", sheet)) + safe <- substr(safe, 1, 40) + tbl <- paste0(table_name, "_", safe) + DBI::dbWriteTable(con, tbl, as.data.frame(df), overwrite = TRUE) + n <- n + nrow(df) + cat(sprintf(" sheet '%s' -> %s (%d rows)\n", sheet, tbl, nrow(df))) + } } - n <- offset } else { - df <- utils::read.csv(path, stringsAsFactors = FALSE) - DBI::dbWriteTable(con, table_name, df, overwrite = TRUE) - n <- nrow(df) - } - } else if (entry$format == "xlsx") { - if (!requireNamespace("readxl", quietly = TRUE)) { - cat(" SKIP: readxl not installed\n") + cat(sprintf(" Unknown format: %s\n", entry$format)) skipped <- skipped + 1L next } - sheets <- readxl::excel_sheets(path) - if (length(sheets) == 1L) { - df <- readxl::read_excel(path, sheet = 1) - DBI::dbWriteTable(con, table_name, as.data.frame(df), overwrite = TRUE) - n <- nrow(df) - } else { - n <- 0L - for (sheet in sheets) { - df <- readxl::read_excel(path, sheet = sheet) - if (nrow(df) == 0L) next - safe <- tolower(gsub("[^a-zA-Z0-9]", "_", sheet)) - safe <- substr(safe, 1, 40) - tbl <- paste0(table_name, "_", safe) - DBI::dbWriteTable(con, tbl, as.data.frame(df), overwrite = TRUE) - n <- n + nrow(df) - cat(sprintf(" sheet '%s' -> %s (%d rows)\n", sheet, tbl, nrow(df))) - } - } - } else { - cat(sprintf(" Unknown format: %s\n", entry$format)) - skipped <- skipped + 1L - next - } - # Write metadata. - cols <- DBI::dbListFields(con, table_name) - DBI::dbExecute(con, "INSERT OR REPLACE INTO _morie_metadata VALUES (?,?,?,?,?,?,?,?,?,?)", - params = list(table_name, entry$source, entry$survey, entry$year, - entry$format, n, length(cols), - jsonlite::toJSON(cols, auto_unbox = FALSE), - format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ"), "")) + # Write metadata. + cols <- DBI::dbListFields(con, table_name) + DBI::dbExecute(con, "INSERT OR REPLACE INTO _morie_metadata VALUES (?,?,?,?,?,?,?,?,?,?)", + params = list( + table_name, entry$source, entry$survey, entry$year, + entry$format, n, length(cols), + jsonlite::toJSON(cols, auto_unbox = FALSE), + format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ"), "" + ) + ) - cat(sprintf(" OK: %s rows, %d cols -> %s\n", format(n, big.mark = ","), length(cols), table_name)) - ingested <- ingested + 1L - }, error = function(e) { - cat(sprintf(" ERROR: %s\n", conditionMessage(e))) - skipped <<- skipped + 1L - }) + cat(sprintf(" OK: %s rows, %d cols -> %s\n", format(n, big.mark = ","), length(cols), table_name)) + ingested <- ingested + 1L + }, + error = function(e) { + cat(sprintf(" ERROR: %s\n", conditionMessage(e))) + skipped <<- skipped + 1L + } + ) } cat(sprintf("\nDone: %d ingested, %d skipped\n", ingested, skipped)) diff --git a/r-package/morie/data-raw/make_summary_data.R b/r-package/morie/data-raw/make_summary_data.R index 68171fbe9b..f71696dec8 100644 --- a/r-package/morie/data-raw/make_summary_data.R +++ b/r-package/morie/data-raw/make_summary_data.R @@ -12,16 +12,22 @@ cat("dataset_catalog:", nrow(dataset_catalog), "entries\n") # 2. Substance categories (from CSUS healthinfobase files) substance_categories <- data.frame( - key = c("alcohol", "cannabis", "opioids", "stimulants", "sedatives", - "polysubstance", "smoking_vaping", "illegal", "harms", "otc", "treatment"), - label = c("Alcohol", "Cannabis", "Opioids", "Stimulants", "Sedatives", - "Polysubstance Use", "Cigarette Smoking & Vaping", - "Illegal Substances", "Substance Use Harms", - "Over-the-Counter Products", "Treatment"), - source_file = c("Alcohol.csv", "Cannabis.csv", "Opioids.csv", "Stimulants.csv", - "Sedatives.csv", "Polysubstance.csv", "Cigarette smoking and vaping.csv", - "Illegal substances.csv", "Substance use harms.csv", - "Over the counter products.csv", "Treatment.csv"), + key = c( + "alcohol", "cannabis", "opioids", "stimulants", "sedatives", + "polysubstance", "smoking_vaping", "illegal", "harms", "otc", "treatment" + ), + label = c( + "Alcohol", "Cannabis", "Opioids", "Stimulants", "Sedatives", + "Polysubstance Use", "Cigarette Smoking & Vaping", + "Illegal Substances", "Substance Use Harms", + "Over-the-Counter Products", "Treatment" + ), + source_file = c( + "Alcohol.csv", "Cannabis.csv", "Opioids.csv", "Stimulants.csv", + "Sedatives.csv", "Polysubstance.csv", "Cigarette smoking and vaping.csv", + "Illegal substances.csv", "Substance use harms.csv", + "Over the counter products.csv", "Treatment.csv" + ), stringsAsFactors = FALSE ) cat("substance_categories:", nrow(substance_categories), "entries\n") @@ -29,12 +35,16 @@ cat("substance_categories:", nrow(substance_categories), "entries\n") # 3. CKAN metadata for open data API access ckan_metadata <- data.frame( survey = c("cpads", "csads", "csus"), - name = c("Canadian Postsecondary Education Alcohol and Drug Use Survey", - "Canadian Student Alcohol and Drugs Survey", - "Canadian Substance Use Survey"), - package_id = c("736fa9b2-62e4-4e31-aea4-51869605b363", - "1f15ca45-8bfd-4f9c-9ec6-2c0c440e69c2", - "65e2d45e-efc6-4c29-9a9b-db59bc96aa0e"), + name = c( + "Canadian Postsecondary Education Alcohol and Drug Use Survey", + "Canadian Student Alcohol and Drugs Survey", + "Canadian Substance Use Survey" + ), + package_id = c( + "736fa9b2-62e4-4e31-aea4-51869605b363", + "1f15ca45-8bfd-4f9c-9ec6-2c0c440e69c2", + "65e2d45e-efc6-4c29-9a9b-db59bc96aa0e" + ), metadata_url = c( "https://open.canada.ca/data/api/action/package_show?id=736fa9b2-62e4-4e31-aea4-51869605b363", "https://open.canada.ca/data/api/action/package_show?id=1f15ca45-8bfd-4f9c-9ec6-2c0c440e69c2", diff --git a/r-package/morie/inst/WORDLIST b/r-package/morie/inst/WORDLIST new file mode 100644 index 0000000000..635b52b1a4 --- /dev/null +++ b/r-package/morie/inst/WORDLIST @@ -0,0 +1,79 @@ +AIPW +ATC +ATE +ATT +CATE +CIHI +CPADS +CTGAN +Carceral +DBI +DOI +DuckDB +GATE +GP +GitHub +Hawkes +IRT +LATE +LISA +MORIE +MRM +McNamara +Medina +Methodology +Multilevel +NIBRS +OTIS +PUMF +PUMFs +PostgreSQL +Reconciliation +Roxygen +Rosenbaum +SIU +SPDX +SQLite +SciencesPo +SoE +Sociolegal +Sprott +StatCan +TPS +Toronto +UTC +URL +URLs +VCS +carceral +cryptographic +debiaser +debiasing +doi +duckdb +filespace +fr +ipw +licensable +linkage +lintr +metadata +microdata +multiarch +multivariate +neighbourhood +pkgcheck +psychometric +psychometrics +relicensed +roxygen +roxygenise +rOpenSci +scikit +sociolegal +spatiotemporal +testthat +tidyverse +tooling +unsystem +yo diff --git a/r-package/morie/inst/doc/causal-inference.html b/r-package/morie/inst/doc/causal-inference.html deleted file mode 100644 index e2b4a400ad..0000000000 --- a/r-package/morie/inst/doc/causal-inference.html +++ /dev/null @@ -1,511 +0,0 @@ - - - - - - - - - - - - - -Causal inference with MORIE - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

This vignette walks through the causal-inference surface of MORIE: -ATE, ATT, ATC, augmented IPW (doubly robust), per-unit CATE, group GATE, -and sensitivity analysis (the E-value). Each estimator follows the same -calling convention \u2014 data, treatment, -outcome, covariates \u2014 so swapping among them -is a matter of changing one function name.

-
-
-

A reproducible synthetic example

-
library(morie)
-set.seed(2026)
-
-n <- 800
-X1 <- rnorm(n)
-X2 <- rnorm(n)
-# Confounded treatment assignment
-ps    <- plogis(0.5 * X1 - 0.3 * X2)
-treat <- as.integer(ps > runif(n))
-# True ATE = +1.0
-y <- 1.0 * treat + 0.7 * X1 - 0.2 * X2 + rnorm(n, sd = 0.5)
-
-df <- data.frame(y = y, treat = treat, X1 = X1, X2 = X2)
-
-
-

Single-robust ATE / ATT / ATC

-
ate <- estimate_ate(df, treatment = "treat", outcome = "y", covariates = c("X1", "X2"))
-att <- estimate_att(df, treatment = "treat", outcome = "y", covariates = c("X1", "X2"))
-atc <- estimate_atc(df, treatment = "treat", outcome = "y", covariates = c("X1", "X2"))
-
-ate$estimate
-#> NULL
-att$estimate
-#> NULL
-atc$estimate
-#> NULL
-

The three quantities estimate slightly different things:

-
    -
  • ATE (average treatment effect): averaged over the -full population.
  • -
  • ATT (treated): averaged over the treated -subgroup.
  • -
  • ATC (controls): averaged over the untreated -subgroup.
  • -
-

Under no effect heterogeneity in the covariates, all three converge. -Under heterogeneity, they diverge in informative ways.

-
-
-

Doubly robust: augmented IPW

-

estimate_aipw() is the augmented -inverse-probability-weighting estimator. It is consistent if either the -propensity model or the outcome model is correctly specified \u2014 the -doubly-robust guarantee.

-
aipw <- estimate_aipw(df, treatment = "treat", outcome = "y", covariates = c("X1", "X2"))
-aipw$estimate
-#> NULL
-aipw$se
-#> [1] 0.04033896
-
-
-

Per-unit CATE (T-learner / S-learner)

-
cate <- estimate_cate(df, treatment = "treat", outcome = "y", covariates = c("X1", "X2"),
-                      meta_learner = "t_learner")
-head(cate)
-#> [1] 1.0534077 0.9977113 1.0363683 1.0293321 0.9726929 0.9714638
-

Each row of the returned data frame contains a per-unit conditional -average treatment effect.

-
-
-

Group-level GATE

-
df$g <- sample(c("A", "B", "C"), nrow(df), replace = TRUE)
-gate <- estimate_gate(df, treatment = "treat", outcome = "y",
-                      covariates = c("X1", "X2"),
-                      group_col = "g")
-gate
-#>   group       ate         se  ci_lower ci_upper   n
-#> 1     B 1.0249270 0.06242292 0.9025781 1.147276 274
-#> 2     A 1.0514470 0.07314839 0.9080762 1.194818 265
-#> 3     C 0.9340143 0.06686665 0.8029557 1.065073 261
-

The result is one row per group level with ate, -se, and CI.

-
-
-

Sensitivity analysis: the E-value

-
# Suppose the observed risk ratio is 2.0; how large would an
-# unmeasured confounder need to be to explain it away?
-evalue <- e_value(rr = 2.0)
-evalue
-#> $e_value
-#> [1] 3.414214
-#> 
-#> $e_value_ci
-#> [1] NA
-

The E-value is the minimum strength (on the risk-ratio scale) that an -unmeasured confounder would need on both treatment and outcome to fully -explain the observed effect.

-
-
-

Where to go next

-
    -
  • The companion mrm-otis-walkthrough vignette applies the -full ten-estimator MRM ensemble to OTIS provincial data.
  • -
  • For survey-weighted versions of these estimators, see the -survey-weighted vignette.
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/chi-square-and-anova.html b/r-package/morie/inst/doc/chi-square-and-anova.html deleted file mode 100644 index 7563ba80d9..0000000000 --- a/r-package/morie/inst/doc/chi-square-and-anova.html +++ /dev/null @@ -1,476 +0,0 @@ - - - - - - - - - - - - - -Chi-square tests and one-way ANOVA - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

For aggregate contingency tables \u2014 the kind that sit behind the Doob -\(\chi^{2}\) family in the MRM -framework \u2014 MORIE exposes chi_square_test() and the -companion effect-size helpers (Cramer\u2019s V, omega-squared) as first-class -functions.

-
-
-

A 2x3 contingency table

-
library(morie)
-tab <- matrix(c(20, 10, 30,
-                15, 25, 35), nrow = 2, byrow = TRUE,
-              dimnames = list(group = c("treated", "control"),
-                              outcome = c("A", "B", "C")))
-
-result <- chi_square_test(tab)
-result$statistic
-#> NULL
-result$p_value
-#> [1] 0.05145575
-result$df
-#> [1] 2
-

chi_square_test() returns the Pearson \(\chi^{2}\), the asymptotic p-value, and the -degrees of freedom in a tidy list.

-
-
-

Effect size: Cramer\u2019s V

-
v <- cramers_v(tab)
-v
-#> [1] 0.209657
-

Cramer\u2019s V scales the chi-square statistic to a 0\u20131 association -measure that is more interpretable than the raw statistic.

-
-
-

One-way ANOVA + omega-squared

-

For continuous outcomes across more than two groups, the one-way -ANOVA pattern in R is aov(y ~ g, data = ...). MORIE exposes -omega_squared() as the less-biased counterpart to -eta-squared:

-
set.seed(2)
-n      <- 90
-group  <- rep(c("A", "B", "C"), each = n / 3)
-y      <- rnorm(n) + ifelse(group == "C", 0.6, 0)
-
-fit  <- stats::aov(y ~ group)
-fsum <- summary(fit)[[1]]
-fstat   <- fsum$`F value`[1]
-df_b    <- fsum$Df[1]
-df_w    <- fsum$Df[2]
-
-omega_squared(f_stat = fstat, df_between = df_b, df_within = df_w, n = n)
-#> [1] -0.00188636
-
-
-

Doob \(\chi^{2}\) family

-

The MRM framework\u2019s \u201cDoob \(\chi^{2}\) family\u201d is a coordinated set of -chi-square tests on the published Sprott / Doob / Iftene contingency -tables (federal SIU operation, COVID-period operation, -torture-classification rates, IEDM analyses). MORIE reproduces all five -published statistics to within \(0.01\) -of the printed values; details are in the MRM paper -(citation("morie")).

-
-
-

Where to go next

-
    -
  • The effect-sizes vignette covers Cohen\u2019s d, Cramer\u2019s V, -omega-squared, proportion-CI, and the E-value in one place.
  • -
  • The mrm-otis-walkthrough vignette shows the chi-square -family applied to OTIS provincial data.
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/cpads-canonicalization.html b/r-package/morie/inst/doc/cpads-canonicalization.html deleted file mode 100644 index ad7f8e0afa..0000000000 --- a/r-package/morie/inst/doc/cpads-canonicalization.html +++ /dev/null @@ -1,462 +0,0 @@ - - - - - - - - - - - - - -CPADS canonicalization and analysis - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

The Canadian Postsecondary Education Alcohol and Drug Use Survey -(CPADS) is one of the Statistics Canada PUMFs that MORIE supports out of -the box. Variable names, value codes, and survey weights differ across -cycles, so MORIE provides a canonical column contract and a -canonicalize_cpads_data() helper to harmonise cycles into a -single analysis-ready tibble.

-
-
-

The CPADS column contract

-
library(morie)
-contract <- cpads_contract()
-str(contract, max.level = 2)
-

cpads_contract() returns the canonical names, value-code -maps, and survey-weight columns. Using the contract is opt-in \u2014 the -estimators in MORIE do not require it \u2014 but it lets you write analysis -code once and run it across cycles unchanged.

-
-
-

Loading + canonicalising

-
raw_2122 <- morie_load_dataset("cpads-2122")
-df       <- canonicalize_cpads_data(raw_2122)
-
-# Validates that all canonical columns are present + correctly
-# typed. Returns silently if OK, or stops with a clear message
-# pointing at the offending column.
-validate_cpads_data(df)
-
-
-

A simple analysis with weights

-
# CPADS ships PUMF weights in a column the contract surfaces.
-weighted_freq <- mean(df$heavy_drinking_30d * df$pumf_weight,
-                      na.rm = TRUE)
-weighted_freq
-
-
-

Survey-weighted causal estimate

-
# Estimate ATE of (canonical-treatment) on
-# (canonical-outcome), passing CPADS PUMF weights:
-ate <- estimate_ate(df,
-                    outcome    = "heavy_drinking_30d",
-                    treatment  = "treat_canonical",
-                    covariates = c("age", "sex", "region"),
-                    weights    = "pumf_weight")
-ate$estimate
-
-
-

Where to go next

-
    -
  • The survey-weighted vignette covers complex-sample -sampling (stratified, cluster, PPS), bootstrap CIs, and design -effects.
  • -
  • The causal-inference vignette covers the full ATE / ATT -/ ATC / AIPW / CATE / GATE estimator family.
  • -
  • For Statistics Canada citation requirements, see the README\u2019s -data-acknowledgment block.
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/dataset-catalog.html b/r-package/morie/inst/doc/dataset-catalog.html deleted file mode 100644 index 681c356fa4..0000000000 --- a/r-package/morie/inst/doc/dataset-catalog.html +++ /dev/null @@ -1,461 +0,0 @@ - - - - - - - - - - - - - -Dataset catalogue - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

MORIE ships a portable SQLite layer with dozens of built-in datasets -covering Canadian carceral, police, oversight, and public- health -surveillance corpora. This vignette shows how to discover and load them -from R.

-
-
-

Browsing the catalogue

-
library(morie)
-catalog <- morie_dataset_catalog()
-head(catalog)
-

Each row of the returned data frame describes one dataset: identifier -(e.g.\u00a0otis-2025, cpads-2122), source, year, -number of rows, and a short description.

-
-
-

Per-dataset detail

-
morie_dataset_info("cpads-2122")
-

morie_dataset_info() returns a list with the variable -names, labels, value codes, citation, and any data-acknowledgment -disclaimer required by the original publisher.

-
-
-

Loading data

-
df <- morie_load_dataset("cpads-2122")
-dim(df)
-

morie_load_dataset() returns a tibble. Public-use -datasets shipped inside the package require no further -configuration.

-
-
-

Configuring local + remote backends

-

For datasets backed by external SQLite mirrors:

-
    -
  • Set MORIE_LOCAL_DB_DIR to a directory of -.sqlite files for fast offline access.
  • -
  • Set MORIE_REMOTE_URL to an HTTP SQL-over-REST endpoint -for network-only exploration.
  • -
-
-
-

Statistics Canada / Health Canada data acknowledgment

-

Several datasets in the catalogue are derived from Statistics Canada -and Health Canada PUMFs (CCS, CSADS, CSUS, CADS, CPADS). The standard -disclaimer applies: although the analyses use Statistics Canada / Health -Canada data, the analyses, interpretations, and conclusions are those of -the analyst and do not represent the views of either agency.

-
-
-

Where to go next

-
    -
  • The intro vignette uses -morie_load_dataset() end-to-end.
  • -
  • The cpads-canonicalization vignette covers the CPADS -column contract and canonicalize_cpads_data() helpers.
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/effect-sizes.html b/r-package/morie/inst/doc/effect-sizes.html deleted file mode 100644 index f5c836719c..0000000000 --- a/r-package/morie/inst/doc/effect-sizes.html +++ /dev/null @@ -1,498 +0,0 @@ - - - - - - - - - - - - - -Effect sizes and association measures - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

Effect sizes complement p-values: they answer \u201chow big is the -effect?\u201d rather than \u201cis there an effect at all?\u201d. MORIE exposes the -standard families used in carceral, public-health, and sociolegal -research.

-
-
-

Cohen\u2019s d (continuous, two-group)

-
library(morie)
-set.seed(7)
-group_a <- rnorm(60, mean = 0.0)
-group_b <- rnorm(60, mean = 0.6)
-
-d <- cohens_d(group_a, group_b)
-d
-#> [1] -0.5043227
-

Cohen\u2019s d expresses the difference between two means in pooled -standard-deviation units. Conventional benchmarks: 0.2 small, 0.5 -medium, 0.8 large.

-
-
-

Cramer\u2019s V (categorical association)

-
tab <- matrix(c(20, 10, 30,
-                15, 25, 35), nrow = 2, byrow = TRUE)
-v <- cramers_v(tab)
-v
-#> [1] 0.209657
-

Cramer\u2019s V scales the chi-square statistic to a 0\u20131 association -measure for contingency tables. It is particularly useful for the -provincial-vs-federal Mandela-rate cross-comparisons in MRM.

-
-
-

Omega-squared (one-way ANOVA)

-

Omega-squared is a less-biased effect-size estimator than eta-squared -for a one-way ANOVA design.

-
omega_squared(f_stat = 5.2, df_between = 2, df_within = 87, n = 90)
-#> [1] 0.08536585
-
-
-

Proportion confidence intervals

-

For a binomial proportion, MORIE exposes Wilson, Clopper\u2013Pearson -exact, and Wald CIs. Wilson is the default and is what we recommend in -published papers (it has better small-sample coverage than Wald).

-
proportion_ci(35, 100)                       # Wilson, 95% CI
-#> $p_hat
-#> [1] 0.35
-#> 
-#> $ci_lower
-#> [1] 0.2636425
-#> 
-#> $ci_upper
-#> [1] 0.4474556
-proportion_ci(35, 100, method = "exact")     # Clopper-Pearson
-#> $p_hat
-#> [1] 0.35
-#> 
-#> $ci_lower
-#> [1] 0.2572938
-#> 
-#> $ci_upper
-#> [1] 0.4518494
-proportion_ci(35, 100, method = "wald")      # Wald
-#> $p_hat
-#> [1] 0.35
-#> 
-#> $ci_lower
-#> [1] 0.2565157
-#> 
-#> $ci_upper
-#> [1] 0.4434843
-
-
-

E-value (sensitivity analysis)

-
e_value(rr = 2.0)
-#> $e_value
-#> [1] 3.414214
-#> 
-#> $e_value_ci
-#> [1] NA
-

The E-value is the minimum strength of association that an unmeasured -confounder would need on both treatment and outcome to fully explain -away an observed risk ratio.

-
-
-

Where to go next

-
    -
  • For survey-weighted versions of cohens_d and -cramers_v, see the survey-weighted -vignette.
  • -
  • For full causal-inference workflows that include ATE / ATT / ATC -effect sizes with CIs, see the causal-inference -vignette.
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/intro.html b/r-package/morie/inst/doc/intro.html deleted file mode 100644 index 9b65f15423..0000000000 --- a/r-package/morie/inst/doc/intro.html +++ /dev/null @@ -1,518 +0,0 @@ - - - - - - - - - - - - - -Getting started with MORIE in R - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

MORIE is a multi-domain scientific-computing toolkit with parallel -Python and R packages. The R package mirrors a substantial subset of the -Python package, focused on the surfaces that are most useful from within -an R workflow: dataset loading, causal estimators, survey sampling and -weighting, basic spectral analysis, and helpers for the MRM -(McNamara\u2013Ruhela\u2013Medina) framework that is MORIE\u2019s primary -sociolegal-data application.

-

This vignette walks through a minimal end-to-end session: load the -package, look at the bundled dataset catalogue, load one dataset, and -run an average-treatment-effect estimator on a small synthetic example. -A second vignette (mrm-otis-walkthrough) covers the MRM -ten-estimator ensemble on OTIS provincial data.

-
-
-

Loading the package

-
library(morie)
-
-
-

The dataset catalogue

-

morie_dataset_catalog() returns a data frame summarising -every dataset bundled with the package or accessible via the package\u2019s -loaders. This is the easiest way to discover what\u2019s available without -leaving the R session.

-
catalog <- morie_dataset_catalog()
-head(catalog)
-

For details on a single dataset (variables, source, citation), use -morie_dataset_info():

-
morie_dataset_info("cpads-2122")
-
-
-

Loading a dataset

-

morie_load_dataset() returns a tibble (or data frame) -for any dataset in the catalogue. Public-use datasets that ship inside -the package require no further configuration; for datasets backed by -remote SQLite mirrors, configure MORIE_LOCAL_DB_DIR (local -directory of .sqlite files) or -MORIE_REMOTE_URL (HTTP endpoint).

-
df <- morie_load_dataset("cpads-2122")
-dim(df)
-
-
-

A simple ATE estimate

-

For users who already have a treatment / outcome / covariate dataset -in hand, the estimators are designed to work on any tibble or data frame -\u2014 there is no hard-coded column-name convention. The example below is -fully synthetic and runnable without any external data.

-
set.seed(2026)
-n <- 500
-X1 <- rnorm(n)
-X2 <- rnorm(n)
-# Confounded treatment assignment.
-treat <- as.integer(plogis(0.5 * X1 - 0.3 * X2) > runif(n))
-# Outcome with a true ATE of +1.0 plus covariate effects.
-y <- 1.0 * treat + 0.7 * X1 - 0.2 * X2 + rnorm(n, sd = 0.5)
-
-df_synth <- data.frame(y = y, treat = treat, X1 = X1, X2 = X2)
-result <- estimate_ate(
-  data       = df_synth,
-  outcome    = "y",
-  treatment  = "treat",
-  covariates = c("X1", "X2")
-)
-print(result)
-#> $ate
-#> [1] 0.9526445
-#> 
-#> $se
-#> [1] 0.05200402
-#> 
-#> $ci_lower
-#> [1] 0.8507166
-#> 
-#> $ci_upper
-#> [1] 1.054572
-#> 
-#> $n
-#> [1] 500
-#> 
-#> $ess
-#> [1] 462.4943
-

The returned object is a list with the point estimate, standard -error, confidence interval, and the underlying nuisance fits, in the -RichResult-compatible structure described in the Python -package paper.

-
-
-

Companion estimators

-

estimate_att(), estimate_atc(), and -estimate_aipw() follow the same calling convention. The -augmented IPW estimator (estimate_aipw()) is doubly robust -under correct specification of either the propensity model or the -outcome model.

-
result_aipw <- estimate_aipw(
-  data       = df_synth,
-  outcome    = "y",
-  treatment  = "treat",
-  covariates = c("X1", "X2")
-)
-print(result_aipw)
-
-
-

Where to go next

-
    -
  • The mrm-otis-walkthrough vignette demonstrates the -ten-estimator MRM ensemble on Ontario OTIS provincial -restrictive-confinement microdata.
  • -
  • The MORIE package paper describes the wider scope of the toolkit -beyond R: signal processing, cryptography, spatial statistics, -statistical-physics-of-crime models, psychometrics, and the full Python -interface.
  • -
  • Citation: see citation("morie").
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/ipw-deep-dive.html b/r-package/morie/inst/doc/ipw-deep-dive.html deleted file mode 100644 index 6f04ca9398..0000000000 --- a/r-package/morie/inst/doc/ipw-deep-dive.html +++ /dev/null @@ -1,489 +0,0 @@ - - - - - - - - - - - - - -IPW deep-dive (Hajek and Horvitz\u2013Thompson) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

Inverse-probability weighting (IPW) is the simplest of the -single-robust causal estimators. This vignette shows the building blocks -that MORIE exposes: the Horvitz\u2013Thompson and the Hajek-stabilised IPW -estimators, propensity-score modelling, and weight-trimming -diagnostics.

-
-
-

Setting up

-
library(morie)
-set.seed(2026)
-n <- 500
-X1 <- rnorm(n)
-X2 <- rnorm(n)
-ps_true <- plogis(0.4 * X1 - 0.3 * X2)
-treat   <- as.integer(ps_true > runif(n))
-y       <- 1.0 * treat + 0.6 * X1 - 0.2 * X2 + rnorm(n, sd = 0.5)
-df <- data.frame(y = y, treat = treat, X1 = X1, X2 = X2)
-
-
-

Estimating propensities

-

The estimate_ate() machinery fits a logistic propensity -model internally and returns the IPW estimate by default. To inspect the -propensities, set propensity_col after fitting them:

-
ps_fit <- glm(treat ~ X1 + X2, family = binomial(), data = df)
-df$ps  <- predict(ps_fit, type = "response")
-summary(df$ps)
-#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-#>  0.1604  0.3727  0.4549  0.4580  0.5374  0.7422
-
-
-

Hajek-stabilised IPW

-

estimate_ate() defaults to the Hajek estimator, which -divides each weighted sum by the corresponding sum of weights. This -stabilises the estimator under finite samples even when the propensity -tails are heavy:

-
ate_hajek <- estimate_ate(df, treatment = "treat", outcome = "y", covariates = c("X1", "X2"),
-                          propensity_col = "ps")
-ate_hajek$estimate
-#> NULL
-ate_hajek$se
-#> [1] 0.04755316
-
-
-

Weight diagnostics

-

In practice, IPW is sensitive to extreme propensities. Two common -diagnostics:

-
# Effective sample size after weighting
-ess <- effective_sample_size(1 / df$ps)
-ess
-#> [1] 458.4318
-
-# Range of weights (extreme means trimming)
-range(1 / df$ps)
-#> [1] 1.347363 6.234294
-

If the effective sample size collapses dramatically, the analysis -should consider:

-
    -
  • Trimming propensities to a sensible interval (e.g.\u00a0[0.05, -0.95])
  • -
  • Switching to a doubly-robust estimator -(estimate_aipw())
  • -
  • Adding more covariates to better separate the treatment groups
  • -
-
-
-

AIPW for protection against IPW failure

-
aipw <- estimate_aipw(df, treatment = "treat", outcome = "y", covariates = c("X1", "X2"))
-aipw$estimate
-#> NULL
-

When propensities are well-behaved, IPW and AIPW should agree to -within Monte Carlo noise. Disagreement is informative: it suggests -either model misspecification or a fragile propensity model.

-
-
-

Where to go next

-
    -
  • The causal-inference vignette covers ATT / ATC / CATE / -GATE.
  • -
  • The survey-weighted vignette covers IPW under -complex-sample designs (when survey weights and propensities both -apply).
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/mrm-dataset-fetchers.Rmd b/r-package/morie/inst/doc/mrm-dataset-fetchers.Rmd deleted file mode 100644 index b95ca382ab..0000000000 --- a/r-package/morie/inst/doc/mrm-dataset-fetchers.Rmd +++ /dev/null @@ -1,195 +0,0 @@ ---- -title: "MRM dataset fetchers and bundled samples" -output: - rmarkdown::html_vignette: - toc: true -vignette: > - %\VignetteIndexEntry{MRM dataset fetchers and bundled samples} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - eval = requireNamespace("morie", quietly = TRUE) -) -``` - -# Overview - -The MRM empirical callables (`mrm_otis_*`, `mrm_tps_*`, -`mrm_siu_*`) operate on three external data sources. This vignette -documents how `morie` makes each of them accessible to end users: - -* **OTIS** (Offender Tracking Information System; published by the - Ontario Ministry of the Solicitor General) \u2014 public CKAN - release on `data.ontario.ca`, reached via the existing - `morie_load_dataset()` infrastructure. -* **TPS** (Toronto Police Service) \u2014 public ArcGIS Open Data, reached - via `morie_fetch_tps()`. -* **SIU** (Ontario Special Investigations Unit) \u2014 public Director's - Reports, scraped on demand by `morie_fetch_siu()` (the corpus is - not shipped because redistribution licensing is unsettled). - -Four small reference samples are bundled with the package -(`inst/extdata/`) so every example runs offline. - -```{r intro} -library(morie) -``` - -# Bundled reference samples - -The samples live in `inst/extdata/` and total ~420 KB. Each is a -1000-row random draw (seed 42) except `otis_b09` and `otis_c11`, -which are shipped whole (already small). - -```{r samples} -# Bundled samples -b01 <- morie_sample("otis_b01") -b09 <- morie_sample("otis_b09") -c11 <- morie_sample("otis_c11") -tps <- morie_sample("tps_assault") - -# Schema sanity -str(b01, max.level = 1) -nrow(b09); nrow(c11) -ncol(tps) -``` - -# OTIS via CKAN (`data.ontario.ca`) - -The OTIS public release is hosted at -`data.ontario.ca/dataset/data-on-inmates-in-ontario` as 28 CSV -resources. The MORIE catalog has four of them registered with their -canonical CKAN resource IDs: - -```{r otis-keys} -cat <- morie_dataset_catalog() -cat[cat$source == "otis", c("key", "name", "table_name", "ckan_resource_id")] -``` - -To pull the full b01 (82,001 rows) from CKAN: - -```{r otis-load, eval = FALSE} -b01_full <- morie_load_dataset("otisb01") -nrow(b01_full) -# 82001 -``` - -The loader downloads the CSV on first call, caches it into the -package SQLite database, and returns a data.frame on subsequent -calls. - -# TPS via ArcGIS Open Data - -Toronto Police Service publishes per-category crime events through -ArcGIS Online. MORIE knows the layer URLs for nine categories. - -```{r tps-urls} -morie_tps_layer_urls() -``` - -To fetch a single category: - -```{r tps-fetch, eval = FALSE} -csv_path <- morie_fetch_tps("Assault") -assault <- utils::read.csv(csv_path) -nrow(assault) -# 254378 (as of mid-2026) -``` - -The fetcher pages through `/query` with a 2000-record-per-page cap -(the ArcGIS-imposed maximum) and writes a tidy CSV to -`~/.cache/morie/tps/tps_Assault.csv`. Subsequent calls return the -cached path unless `overwrite = TRUE`. - -Filtering at the server side: - -```{r tps-filter, eval = FALSE} -recent <- morie_fetch_tps("Homicides", - where = "OCC_YEAR >= 2024", - overwrite = TRUE) -``` - -# SIU via on-demand scraper - -The Ontario SIU publishes Director's Reports at -`siu.on.ca/en/case_directors_reports.php`. MORIE includes an -on-demand scraper that: - -* Iterates over the index page(s) (optionally filtered by year) -* Extracts case links -* Pulls each case detail page -* Parses incident date, notifying police service, Director's - decision, and outcome text into a single CSV - -```{r siu, eval = FALSE} -csv_path <- morie_fetch_siu() # full unfiltered index -siu <- utils::read.csv(csv_path) -nrow(siu) - -# Or restricted to specific years: -csv_path <- morie_fetch_siu(years = 2020:2025, overwrite = TRUE) -``` - -## Why a scraper rather than a shipped dataset? - -The legal status of redistributing a single tabular copy of public -oversight reports is not clearly established. Running the scraper -per-user is unambiguously fair use of public information; bundling -the scraped corpus might be more questionable. The scraper itself -respects a 2-second rate limit, sets a clear User-Agent, and follows -the SIU site's published structure. - -## R-side wrapper, Python-side implementation - -`morie_fetch_siu()` in R is a thin `reticulate` wrapper around -`morie.siu_fetch.fetch_siu_cases()` in Python. This keeps the regex -parsing logic in one canonical place. If `reticulate` isn't -installed, fall back to calling the Python directly: - -```{bash, eval = FALSE} -python3 -c "from morie.siu_fetch import fetch_siu_cases; print(fetch_siu_cases())" -``` - -# Workflow: from bundled sample to verified result - -Putting it together \u2014 a complete worked example without any network -call: - -```{r workflow} -b01 <- morie_sample("otis_b01") - -# Mandela classification with default "individual_any" denominator -mrm_classify_mandela(b01, denominator = "row") - -# Segregation duration KM -mrm_otis_seg_duration_km(b01, - group_cols = "MentalHealth_Alert") - -# Mortification co-occurrence: Cramer's V across alert pairs -mrm_otis_mortification_cooccurrence(b01) -``` - -For a full-data version, swap `morie_sample("otis_b01")` for -`morie_load_dataset("otisb01")` and re-run the same callables. - -# Caching layout - -| Cache path | Populated by | Size (full) | -|---|---|---| -| `~/.cache/morie/morie.db` (SQLite) | `morie_load_dataset(*)` | a few MB to ~1 GB depending on selection | -| `~/.cache/morie/tps/tps_*.csv` | `morie_fetch_tps()` | ~5--50 MB per category | -| `~/.cache/morie/siu/SIU.csv` | `morie_fetch_siu()` | ~5 MB | - -# References - -* OTIS data dictionary \u2014 - `data.ontario.ca/dataset/data-on-inmates-in-ontario` -* Toronto Police Open Data \u2014 `data.torontopolice.on.ca/` -* SIU Director's Reports \u2014 `siu.on.ca/en/case_directors_reports.php` -* MRM theoretical paper \u2014 Zenodo DOI - [10.5281/zenodo.20096075](https://doi.org/10.5281/zenodo.20096075) diff --git a/r-package/morie/inst/doc/mrm-dataset-fetchers.html b/r-package/morie/inst/doc/mrm-dataset-fetchers.html deleted file mode 100644 index 71a19aa5e3..0000000000 --- a/r-package/morie/inst/doc/mrm-dataset-fetchers.html +++ /dev/null @@ -1,644 +0,0 @@ - - - - - - - - - - - - - -MRM dataset fetchers and bundled samples - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

The MRM empirical callables (mrm_otis_*, -mrm_tps_*, mrm_siu_*) operate on three -external data sources. This vignette documents how morie -makes each of them accessible to end users:

-
    -
  • OTIS (Offender Tracking Information System; -published by the Ontario Ministry of the Solicitor General) 014 public -CKAN release on data.ontario.ca, reached via the existing -morie_load_dataset() infrastructure.
  • -
  • TPS (Toronto Police Service) 014 public ArcGIS Open -Data, reached via morie_fetch_tps().
  • -
  • SIU (Ontario Special Investigations Unit) 014 -public Director’s Reports, scraped on demand by -morie_fetch_siu() (the corpus is not shipped because -redistribution licensing is unsettled).
  • -
-

Four small reference samples are bundled with the package -(inst/extdata/) so every example runs offline.

-
library(morie)
-
-
-

Bundled reference samples

-

The samples live in inst/extdata/ and total ~420 KB. -Each is a 1000-row random draw (seed 42) except otis_b09 -and otis_c11, which are shipped whole (already small).

-
# Bundled samples
-b01 <- morie_sample("otis_b01")
-b09 <- morie_sample("otis_b09")
-c11 <- morie_sample("otis_c11")
-tps <- morie_sample("tps_assault")
-
-# Schema sanity
-str(b01, max.level = 1)
-#> 'data.frame':    1000 obs. of  18 variables:
-#>  $ EndFiscalYear                                         : int  2025 2023 2023 2023 2024 2024 2025 2025 2024 2023 ...
-#>  $ UniqueIndividual_ID                                   : chr  "2025-08960-SG" "2023-11152-SG" "2023-08955-SG" "2023-11360-SG" ...
-#>  $ Gender                                                : chr  "Male" "Male" "Female" "Male" ...
-#>  $ Region_AtTimeOfPlacement                              : chr  "Eastern" "Northern" "Central" "Central" ...
-#>  $ Region_MostRecentPlacement                            : chr  "Eastern" "Northern" "Central" "Central" ...
-#>  $ Age_Category                                          : chr  "25 to 49" "25 to 49" "25 to 49" "25 to 49" ...
-#>  $ NumberConsecutiveDays_Segregation                     : int  4 1 6 1 2 1 1 1 2 3 ...
-#>  $ SegReason_SecurityOfInstitution_SafetyOfOthers        : chr  "No" "No" "No" "No" ...
-#>  $ SegReason_InmateNeedsProtection                       : chr  "No" "Yes" "No" "No" ...
-#>  $ SegReason_InmateNeedsProtection_Medical               : chr  "Yes" "No" "No" "No" ...
-#>  $ SegReason_SecurityOfInstitution_SafetyOfOthers_Medical: chr  "No" "No" "Yes" "Yes" ...
-#>  $ SegReason_Disciplinary_Segregation                    : chr  "No" "No" "No" "No" ...
-#>  $ SegReason_InmateRefuseSearch_Scan                     : chr  "No" "No" "No" "No" ...
-#>  $ MentalHealth_Alert                                    : chr  "Yes" "No" "Yes" "No" ...
-#>  $ SuicideRisk_Alert                                     : chr  "Yes" "No" "Yes" "No" ...
-#>  $ SuicideWatch_Alert                                    : chr  "Yes" "No" "Yes" "No" ...
-#>  $ SegReason_Other                                       : chr  "No" "" "" "" ...
-#>  $ Number_Of_Placements                                  : int  1 8 1 1 1 2 2 1 2 1 ...
-nrow(b09); nrow(c11)
-#> [1] 78
-#> [1] 33
-ncol(tps)
-#> [1] 31
-
-
-

OTIS via CKAN (data.ontario.ca)

-

The OTIS public release is hosted at -data.ontario.ca/dataset/data-on-inmates-in-ontario as 28 -CSV resources. The MORIE catalog has four of them registered with their -canonical CKAN resource IDs:

-
cat <- morie_dataset_catalog()
-cat[cat$source == "otis", c("key", "name", "table_name", "ckan_resource_id")]
-#>        key                                                        name
-#> 37 otisa01        OTIS a01: Restrictive Confinement - Detailed Dataset
-#> 38 otisb01                    OTIS b01: Segregation - Detailed Dataset
-#> 39 otisb09 OTIS b09: Individuals in Segregation - Number of Placements
-#> 40 otisc11 OTIS c11: Individuals in Segregation/RC by Aggregate Length
-#>    table_name                     ckan_resource_id
-#> 37    otisa01 5a0c5804-a055-4031-9743-73f556e43bb4
-#> 38    otisb01 406e6d90-d568-4553-8ca7-bc9f90e133b9
-#> 39    otisb09 df24e943-d52b-43a8-a10e-a3cc906e26bb
-#> 40    otisc11 9c7b74a5-53ad-4ef0-a7a6-97772cd01c55
-

To pull the full b01 (82,001 rows) from CKAN:

-
b01_full <- morie_load_dataset("otisb01")
-nrow(b01_full)
-# 82001
-

The loader downloads the CSV on first call, caches it into the -package SQLite database, and returns a data.frame on subsequent -calls.

-
-
-

TPS via ArcGIS Open Data

-

Toronto Police Service publishes per-category crime events through -ArcGIS Online. MORIE knows the layer URLs for nine categories.

-
morie_tps_layer_urls()
-#>                                                                                                                       Assault 
-#>                         "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Assault_Open_Data/FeatureServer/0" 
-#>                                                                                                                     AutoTheft 
-#>                      "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Auto_Theft_Open_Data/FeatureServer/0" 
-#>                                                                                                                  BicycleTheft 
-#>                  "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Bicycle_Thefts_Open_Data/FeatureServer/0" 
-#>                                                                                                                 BreakAndEnter 
-#>                 "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Break_and_Enter_Open_Data/FeatureServer/0" 
-#>                                                                                                                     Homicides 
-#>        "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Homicides_Open_Data_ASR_RC_TBL_002/FeatureServer/0" 
-#>                                                                                                                       Robbery 
-#>                         "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Robbery_Open_Data/FeatureServer/0" 
-#>                                                                                                   ShootingAndFirearmDiscarges 
-#> "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Shooting_and_Firearm_Discharges_Open_Data/FeatureServer/0" 
-#>                                                                                                                   TheftFromMV 
-#>        "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Theft_From_Motor_Vehicle_Open_Data/FeatureServer/0" 
-#>                                                                                                                     TheftOver 
-#>                      "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Theft_Over_Open_Data/FeatureServer/0"
-

To fetch a single category:

-
csv_path <- morie_fetch_tps("Assault")
-assault <- utils::read.csv(csv_path)
-nrow(assault)
-# 254378 (as of mid-2026)
-

The fetcher pages through /query with a -2000-record-per-page cap (the ArcGIS-imposed maximum) and writes a tidy -CSV to ~/.cache/morie/tps/tps_Assault.csv. Subsequent calls -return the cached path unless overwrite = TRUE.

-

Filtering at the server side:

-
recent <- morie_fetch_tps("Homicides",
-                          where = "OCC_YEAR >= 2024",
-                          overwrite = TRUE)
-
-
-

SIU via on-demand scraper

-

The Ontario SIU publishes Director’s Reports at -siu.on.ca/en/case_directors_reports.php. MORIE includes an -on-demand scraper that:

-
    -
  • Iterates over the index page(s) (optionally filtered by year)
  • -
  • Extracts case links
  • -
  • Pulls each case detail page
  • -
  • Parses incident date, notifying police service, Director’s decision, -and outcome text into a single CSV
  • -
-
csv_path <- morie_fetch_siu()       # full unfiltered index
-siu <- utils::read.csv(csv_path)
-nrow(siu)
-
-# Or restricted to specific years:
-csv_path <- morie_fetch_siu(years = 2020:2025, overwrite = TRUE)
-
-

Why a scraper rather than a shipped dataset?

-

The legal status of redistributing a single tabular copy of public -oversight reports is not clearly established. Running the scraper -per-user is unambiguously fair use of public information; bundling the -scraped corpus might be more questionable. The scraper itself respects a -2-second rate limit, sets a clear User-Agent, and follows the SIU site’s -published structure.

-
-
-

R-side wrapper, Python-side implementation

-

morie_fetch_siu() in R is a thin reticulate -wrapper around morie.siu_fetch.fetch_siu_cases() in Python. -This keeps the regex parsing logic in one canonical place. If -reticulate isn’t installed, fall back to calling the Python -directly:

-
python3 -c "from morie.siu_fetch import fetch_siu_cases; print(fetch_siu_cases())"
-
-
-
-

Workflow: from bundled sample to verified result

-

Putting it together 014 a complete worked example without any network -call:

-
b01 <- morie_sample("otis_b01")
-
-# Mandela classification with default "individual_any" denominator
-mrm_classify_mandela(b01, denominator = "row")
-#>     year denominator n_mandela        rate  pct n_broader_rc rate_broader
-#> 1   2023         362         0 0.000000000 0.00            0  0.000000000
-#> 2   2024         337         3 0.008902077 0.89            3  0.008902077
-#> 3   2025         301         4 0.013289037 1.33            4  0.013289037
-#> 4 pooled        1000         7 0.007000000 0.70            7  0.007000000
-
-# Segregation duration KM
-mrm_otis_seg_duration_km(b01,
-                         group_cols = "MentalHealth_Alert")
-#>   stratum   n mean_days median_days q25_days pct_above_mandela
-#> 1      No 499      2.76           2        3               0.6
-#> 2     Yes 501      3.29           2        4               0.8
-#>   median_among_above_mandela
-#> 1                       54.0
-#> 2                       41.5
-
-# Mortification co-occurrence: Cramer's V across alert pairs
-mrm_otis_mortification_cooccurrence(b01)
-#>              alert_a            alert_b    n   chi2 df   p_value cramers_v
-#> 1 MentalHealth_Alert  SuicideRisk_Alert 1000  33.25  1  8.12e-09    0.1823
-#> 2 MentalHealth_Alert SuicideWatch_Alert 1000  12.09  1  5.08e-04    0.1099
-#> 3  SuicideRisk_Alert SuicideWatch_Alert 1000 470.37  1 2.66e-104    0.6858
-

For a full-data version, swap morie_sample("otis_b01") -for morie_load_dataset("otisb01") and re-run the same -callables.

-
-
-

Caching layout

- ----- - - - - - - - - - - - - - - - - - - - - - - - - -
Cache pathPopulated bySize (full)
~/.cache/morie/morie.db (SQLite)morie_load_dataset(*)a few MB to ~1 GB depending on selection
~/.cache/morie/tps/tps_*.csvmorie_fetch_tps()~5–50 MB per category
~/.cache/morie/siu/SIU.csvmorie_fetch_siu()~5 MB
-
-
-

References

-
    -
  • OTIS data dictionary 014 -data.ontario.ca/dataset/data-on-inmates-in-ontario
  • -
  • Toronto Police Open Data 014 -data.torontopolice.on.ca/
  • -
  • SIU Director’s Reports 014 -siu.on.ca/en/case_directors_reports.php
  • -
  • MRM theoretical paper 014 Zenodo DOI 10.5281/zenodo.20096075
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/mrm-empirical-callables.Rmd b/r-package/morie/inst/doc/mrm-empirical-callables.Rmd deleted file mode 100644 index 5c6c4d8b78..0000000000 --- a/r-package/morie/inst/doc/mrm-empirical-callables.Rmd +++ /dev/null @@ -1,199 +0,0 @@ ---- -title: "MRM empirical callables (OTIS / TPS / SIU)" -output: - rmarkdown::html_vignette: - toc: true -vignette: > - %\VignetteIndexEntry{MRM empirical callables (OTIS / TPS / SIU)} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - eval = requireNamespace("morie", quietly = TRUE) -) -``` - -# Overview - -This vignette documents the `mrm_otis_*()`, `mrm_tps_*()`, and -`mrm_siu_*()` empirical callables. Each function is a one-line entry -point to a verified analysis used in the *MRM empirical paper* -(Ruhela 2026, in preparation). Every example below runs on the small -reference samples bundled with the package, so the vignette is -network-free. - -For the full datasets: - -* OTIS \u2192 `morie_load_dataset("otisb01")` (downloads via CKAN on - first call; subsequent calls hit the local SQLite cache) -* TPS \u2192 `morie_fetch_tps("Assault")` (ArcGIS REST) -* SIU \u2192 `morie_fetch_siu()` (on-demand scrape of public reports) - -See `vignette("mrm-dataset-fetchers")` for the dataset side. - -```{r load} -library(morie) -b01 <- morie_sample("otis_b01") -b09 <- morie_sample("otis_b09") -tps <- morie_sample("tps_assault") -``` - -# OTIS suite - -## Placement-count concentration on `b09` - -The b09 long-format file publishes per (fiscal year \u00d7 placement-count -band \u00d7 gender) counts of individuals in segregation. The callable -expands the banded counts using midpoints and returns Hill-MLE Pareto -exponent, Gini coefficient, mean placements per individual, and the -top-k% concentration share. - -```{r b09} -mrm_otis_placement_concentration(b09) -``` - -The values are computed *within fiscal year*: OTIS -`UniqueIndividual_ID` has format `YYYY-XXXXX-SG` and is randomly -reassigned every fiscal year, so cross-year tracking is invalid by -design. - -## Segregation-duration KM on `b01` - -`NumberConsecutiveDays_Segregation` is the duration in days of each -placement (no censoring \u2014 all durations are observed). The callable -reports the per-stratum mean, median, q25, and the fraction -above the UN Mandela 15-day cutoff. - -```{r b01-duration} -mrm_otis_seg_duration_km(b01) -mrm_otis_seg_duration_km(b01, group_cols = "MentalHealth_Alert") -``` - -This callable replaces the misreading of `YYYY-XXXXX-SG` as a -persistent person identifier, which produces a spurious cross-year -"time-to-readmission" artifact. - -## Mortification co-occurrence (alert columns) - -The three b01 alert flags (`MentalHealth_Alert`, `SuicideRisk_Alert`, -`SuicideWatch_Alert`) co-occur to a degree well above independence. -The substantive figure is `MentalHealth \u00d7 SuicideRisk` Cramer's V. - -```{r mortification} -mrm_otis_mortification_cooccurrence(b01) -``` - -## Region locality - -Ontario provincial seg/RC placement is overwhelmingly -locality-preserving \u2014 over 95% of placements remain within the same -region in the full b01. - -```{r region, eval = FALSE} -# (Region columns are present only in the full b01, not the bundled -# sample; uncomment after morie_load_dataset("otisb01") or -# morie_fetch_tps(...) if needed.) -res <- mrm_otis_region_locality(b01) -print(res$table) -cat("diagonal share:", res$diagonal_share, " V:", res$cramers_v, "\n") -``` - -## Mandela classification - -`mrm_classify_mandela()` shipped in v0.1.14 and remains the canonical -Mandela classifier in v0.2.0. It supports three operationalisations: - -```{r mandela} -mrm_classify_mandela(b01, denominator = "row") # per-placement -mrm_classify_mandela(b01, denominator = "individual_any") # per-person -mrm_classify_mandela(b01, denominator = "individual_cumulative") -``` - -The provincial-canonical 12.5/16.5/20.6 % torture rates from c11 -require the `c11` aggregate (loaded via `morie_sample("otis_c11")`); -see the MRM empirical paper \u00a76. - -# TPS suite - -## Levy-flight Hill exponent on inter-event step lengths - -Treats consecutive events in chronological order as a single stream -and computes the haversine inter-event step length (km). Returns the -Hill-MLE exponent restricted to steps above `min_step_km`. - -```{r tps-levy} -mrm_tps_levy_scaling(tps) -``` - -## Moran's I + DBSCAN clustering - -Grids the WGS84 extent into a coarse raster, counts events per cell, -and computes the global Moran's I via a rook-contiguity matrix. Also -runs DBSCAN on the raw lat/long points (rescaled to km) for cluster -counts. - -```{r tps-moran} -mrm_tps_moran_clustering(tps, grid_resolution = 20L) -``` - -For the high-precision computation on the full 254,378-event Assault -file, use the `morie` Python `tps_spatial_advanced` pipeline; the -R version is for quick interactive auditing. - -## Neighbourhood inter-event recurrence - -For each `HOOD_158` neighbourhood, sorts events chronologically and -computes the gap (in days) between consecutive events. - -```{r tps-recur} -head(mrm_tps_neighbourhood_recurrence_km(tps)) -``` - -## Hawkes manifest loader - -`mrm_tps_load_hawkes_refit(path)` reads -`paper_hawkes_refit.json` (the per-category Hawkes refit table from -the MRM empirical paper \u00a77.1-7.2) and returns it as a tidy -data.frame. Skipped here because the JSON is in the MOIRAIS data -manifest, not bundled with the package. - -# SIU suite - -The SIU callables operate on the SIU.csv file produced by -`morie_fetch_siu()` (an on-demand scraper of the public Director's -Reports). The scraped corpus is not shipped, but the callables -themselves do not depend on shipped data. - -```{r siu, eval = FALSE} -siu_path <- morie_fetch_siu() -siu <- read.csv(siu_path) -res <- mrm_siu_case_to_decision_km(siu) -print(res$pooled) -head(res$by_service[order(-res$by_service$n),]) -mrm_siu_per_service_rate(siu) -mrm_siu_outcome_classifier(siu) -``` - -The verified pooled median in our test snapshot is **120 days from -incident to Director's decision** (n = 1,711 cases). Per-service -medians cluster tightly around 120, indicating a system-wide -processing cadence rather than a per-jurisdiction effect. - -# References - -* MRM theoretical paper \u2014 Ruhela (2026), *MRM: Multilevel - Reconciliation Methodology --- A Multi-Source Statistical - Foundation for Canadian Carceral, Police, and Oversight Data*, - Zenodo - [10.5281/zenodo.20096075](https://doi.org/10.5281/zenodo.20096075). -* MRM empirical paper \u2014 Ruhela (2026), *Solitary Confinement, - Self-Excitation, and Institutional Churn: Empirical Applications - of MRM to Canadian Carceral and Police Data*, Zenodo - [10.5281/zenodo.20175689](https://doi.org/10.5281/zenodo.20175689). -* OTIS data dictionary \u2014 `data.ontario.ca/dataset/data-on-inmates-in-ontario`. -* Toronto Police Open Data \u2014 `data.torontopolice.on.ca/`. -* SIU public Director's Reports \u2014 `siu.on.ca/en/case_directors_reports.php`. diff --git a/r-package/morie/inst/doc/mrm-empirical-callables.html b/r-package/morie/inst/doc/mrm-empirical-callables.html deleted file mode 100644 index 3b8301a60b..0000000000 --- a/r-package/morie/inst/doc/mrm-empirical-callables.html +++ /dev/null @@ -1,632 +0,0 @@ - - - - - - - - - - - - - -MRM empirical callables (OTIS / TPS / SIU) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

This vignette documents the mrm_otis_*(), -mrm_tps_*(), and mrm_siu_*() empirical -callables. Each function is a one-line entry point to a verified -analysis used in the MRM empirical paper (Ruhela 2026, in -preparation). Every example below runs on the small reference samples -bundled with the package, so the vignette is network-free.

-

For the full datasets:

-
    -
  • OTIS 192 morie_load_dataset("otisb01") (downloads via -CKAN on first call; subsequent calls hit the local SQLite cache)
  • -
  • TPS 192 morie_fetch_tps("Assault") (ArcGIS REST)
  • -
  • SIU 192 morie_fetch_siu() (on-demand scrape of public -reports)
  • -
-

See vignette("mrm-dataset-fetchers") for the dataset -side.

-
library(morie)
-b01 <- morie_sample("otis_b01")
-b09 <- morie_sample("otis_b09")
-tps <- morie_sample("tps_assault")
-
-
-

OTIS suite

-
-

Placement-count concentration on b09

-

The b09 long-format file publishes per (fiscal year 0d7 -placement-count band 0d7 gender) counts of individuals in segregation. -The callable expands the banded counts using midpoints and returns -Hill-MLE Pareto exponent, Gini coefficient, mean placements per -individual, and the top-k% concentration share.

-
mrm_otis_placement_concentration(b09)
-#>     year n_individuals n_placements mean_per_individual   gini hill_alpha
-#> 1   2023         12647        55421            4.382146 0.5331     2.0174
-#> 2   2024         10881        47123            4.330760 0.5862     2.1599
-#> 3   2025          9608        46893            4.880620 0.6057     2.0880
-#> 4 pooled         33136       149437            4.509808 0.5748     2.0814
-#>   top_pct_share
-#> 1        0.2932
-#> 2        0.3351
-#> 3        0.3215
-#> 4        0.3180
-

The values are computed within fiscal year: OTIS -UniqueIndividual_ID has format YYYY-XXXXX-SG -and is randomly reassigned every fiscal year, so cross-year tracking is -invalid by design.

-
-
-

Segregation-duration KM on b01

-

NumberConsecutiveDays_Segregation is the duration in -days of each placement (no censoring 014 all durations are observed). -The callable reports the per-stratum mean, median, q25, and the fraction -above the UN Mandela 15-day cutoff.

-
mrm_otis_seg_duration_km(b01)
-#>   stratum    n mean_days median_days q25_days pct_above_mandela
-#> 1  pooled 1000      3.03           2        3               0.7
-#>   median_among_above_mandela
-#> 1                         54
-mrm_otis_seg_duration_km(b01, group_cols = "MentalHealth_Alert")
-#>   stratum   n mean_days median_days q25_days pct_above_mandela
-#> 1      No 499      2.76           2        3               0.6
-#> 2     Yes 501      3.29           2        4               0.8
-#>   median_among_above_mandela
-#> 1                       54.0
-#> 2                       41.5
-

This callable replaces the misreading of YYYY-XXXXX-SG -as a persistent person identifier, which produces a spurious cross-year -“time-to-readmission” artifact.

-
-
-

Mortification co-occurrence (alert columns)

-

The three b01 alert flags (MentalHealth_Alert, -SuicideRisk_Alert, SuicideWatch_Alert) -co-occur to a degree well above independence. The substantive figure is -MentalHealth \u00d7 SuicideRisk Cramer’s V.

-
mrm_otis_mortification_cooccurrence(b01)
-#>              alert_a            alert_b    n   chi2 df   p_value cramers_v
-#> 1 MentalHealth_Alert  SuicideRisk_Alert 1000  33.25  1  8.12e-09    0.1823
-#> 2 MentalHealth_Alert SuicideWatch_Alert 1000  12.09  1  5.08e-04    0.1099
-#> 3  SuicideRisk_Alert SuicideWatch_Alert 1000 470.37  1 2.66e-104    0.6858
-
-
-

Region locality

-

Ontario provincial seg/RC placement is overwhelmingly -locality-preserving 014 over 95% of placements remain within the same -region in the full b01.

-
# (Region columns are present only in the full b01, not the bundled
-# sample; uncomment after morie_load_dataset("otisb01") or
-# morie_fetch_tps(...) if needed.)
-res <- mrm_otis_region_locality(b01)
-print(res$table)
-cat("diagonal share:", res$diagonal_share, "  V:", res$cramers_v, "\n")
-
-
-

Mandela classification

-

mrm_classify_mandela() shipped in v0.1.14 and remains -the canonical Mandela classifier in v0.2.0. It supports three -operationalisations:

-
mrm_classify_mandela(b01, denominator = "row")           # per-placement
-#>     year denominator n_mandela        rate  pct n_broader_rc rate_broader
-#> 1   2023         362         0 0.000000000 0.00            0  0.000000000
-#> 2   2024         337         3 0.008902077 0.89            3  0.008902077
-#> 3   2025         301         4 0.013289037 1.33            4  0.013289037
-#> 4 pooled        1000         7 0.007000000 0.70            7  0.007000000
-mrm_classify_mandela(b01, denominator = "individual_any") # per-person
-#>     year denominator n_mandela        rate  pct n_broader_rc rate_broader
-#> 1   2023         354         0 0.000000000 0.00            0  0.000000000
-#> 2   2024         329         3 0.009118541 0.91            3  0.009118541
-#> 3   2025         289         4 0.013840830 1.38            4  0.013840830
-#> 4 pooled         972         7 0.007201646 0.72            7  0.007201646
-mrm_classify_mandela(b01, denominator = "individual_cumulative")
-#>     year denominator n_mandela        rate  pct n_broader_rc rate_broader
-#> 1   2023         354         0 0.000000000 0.00            0  0.000000000
-#> 2   2024         329         3 0.009118541 0.91            3  0.009118541
-#> 3   2025         289         5 0.017301038 1.73            5  0.017301038
-#> 4 pooled         972         8 0.008230453 0.82            8  0.008230453
-

The provincial-canonical 12.5/16.5/20.6 % torture rates from c11 -require the c11 aggregate (loaded via -morie_sample("otis_c11")); see the MRM empirical paper -0a76.

-
-
-
-

TPS suite

-
-

Levy-flight Hill exponent on inter-event step lengths

-

Treats consecutive events in chronological order as a single stream -and computes the haversine inter-event step length (km). Returns the -Hill-MLE exponent restricted to steps above -min_step_km.

-
mrm_tps_levy_scaling(tps)
-#> $n_events
-#> [1] 1000
-#> 
-#> $n_steps_tail
-#> [1] 995
-#> 
-#> $min_step_km
-#> [1] 0.5
-#> 
-#> $hill_alpha
-#> [1] 1.3043
-
-
-

Moran’s I + DBSCAN clustering

-

Grids the WGS84 extent into a coarse raster, counts events per cell, -and computes the global Moran’s I via a rook-contiguity matrix. Also -runs DBSCAN on the raw lat/long points (rescaled to km) for cluster -counts.

-
mrm_tps_moran_clustering(tps, grid_resolution = 20L)
-#> $morans_I
-#> [1] -0.000138
-#> 
-#> $morans_z
-#> [1] 0.67
-#> 
-#> $dbscan_n_clusters
-#> [1] 21
-#> 
-#> $dbscan_n_noise
-#> [1] 730
-#> 
-#> $dbscan_largest
-#> [1] 82
-

For the high-precision computation on the full 254,378-event Assault -file, use the morie Python -tps_spatial_advanced pipeline; the R version is for quick -interactive auditing.

-
-
-

Neighbourhood inter-event recurrence

-

For each HOOD_158 neighbourhood, sorts events -chronologically and computes the gap (in days) between consecutive -events.

-
head(mrm_tps_neighbourhood_recurrence_km(tps))
-#>   hood n_events n_gaps mean_gap_days median_gap_days p25_gap_days p75_gap_days
-#> 1  001       17     16        267.06           213.5       101.00       356.75
-#> 2  002       12     11        447.91           360.0        74.00       606.00
-#> 3  003        5      4        887.75           869.0       557.75      1199.00
-#> 4  004        3      2       1781.00          1781.0      1065.50      2496.50
-#> 5  005        4      3        776.00           676.0       537.00       965.00
-#> 6  006        5      4        845.00           731.0       289.50      1286.50
-
-
-

Hawkes manifest loader

-

mrm_tps_load_hawkes_refit(path) reads -paper_hawkes_refit.json (the per-category Hawkes refit -table from the MRM empirical paper 0a77.1-7.2) and returns it as a tidy -data.frame. Skipped here because the JSON is in the MOIRAIS data -manifest, not bundled with the package.

-
-
-
-

SIU suite

-

The SIU callables operate on the SIU.csv file produced by -morie_fetch_siu() (an on-demand scraper of the public -Director’s Reports). The scraped corpus is not shipped, but the -callables themselves do not depend on shipped data.

-
siu_path <- morie_fetch_siu()
-siu <- read.csv(siu_path)
-res <- mrm_siu_case_to_decision_km(siu)
-print(res$pooled)
-head(res$by_service[order(-res$by_service$n),])
-mrm_siu_per_service_rate(siu)
-mrm_siu_outcome_classifier(siu)
-

The verified pooled median in our test snapshot is 120 days -from incident to Director’s decision (n = 1,711 cases). -Per-service medians cluster tightly around 120, indicating a system-wide -processing cadence rather than a per-jurisdiction effect.

-
-
-

References

-
    -
  • MRM theoretical paper 014 Ruhela (2026), MRM: Multilevel -Reconciliation Methodology — A Multi-Source Statistical Foundation for -Canadian Carceral, Police, and Oversight Data, Zenodo 10.5281/zenodo.20096075.
  • -
  • MRM empirical paper 014 Ruhela (2026), Solitary Confinement, -Self-Excitation, and Institutional Churn: Empirical Applications of MRM -to Canadian Carceral and Police Data, Zenodo 10.5281/zenodo.20175689.
  • -
  • OTIS data dictionary 014 -data.ontario.ca/dataset/data-on-inmates-in-ontario.
  • -
  • Toronto Police Open Data 014 -data.torontopolice.on.ca/.
  • -
  • SIU public Director’s Reports 014 -siu.on.ca/en/case_directors_reports.php.
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/mrm-otis-walkthrough.Rmd b/r-package/morie/inst/doc/mrm-otis-walkthrough.Rmd deleted file mode 100644 index 402754469b..0000000000 --- a/r-package/morie/inst/doc/mrm-otis-walkthrough.Rmd +++ /dev/null @@ -1,110 +0,0 @@ ---- -title: "MRM walkthrough on OTIS provincial data" -output: - rmarkdown::html_vignette: - toc: true -vignette: > - %\VignetteIndexEntry{MRM walkthrough on OTIS provincial data} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - eval = requireNamespace("morie", quietly = TRUE) -) -``` - -# What this vignette covers - -The **MRM (Multilevel Reconciliation Methodology;** people-credit -reading: McNamara--Ruhela--Medina**)** framework is a coordinated set -of ten causal estimators paired with a multi-source data layer for -Canadian carceral, police, and oversight data. This vignette uses the -provincial Offender Tracking Information System (OTIS; published by -the Ontario Ministry of the Solicitor General) -restrictive-confinement microdata as an example, applies the -ten-estimator ensemble to a binary-treatment design on dataset `a01`, -and shows how to read the resulting summary. - -The mathematical foundations are developed in the companion paper -(Ruhela 2026, *The MRM Framework*, Zenodo -[10.5281/zenodo.20096075](https://doi.org/10.5281/zenodo.20096075)). - -# Loading OTIS - -OTIS is shipped with the package; the `morie_load_dataset()` loader -hides the SQLite-backed indirection. - -```{r load-otis, eval = FALSE} -library(morie) -otis <- morie_load_dataset("otis-2025-a01") -str(otis) -``` - -# The canonical a01 design - -For dataset `a01` the canonical formulation is -`T_high_ac` (a binary treatment derived from administrative-classification -flags) on `Y_vm_count` (a count of a specific in-confinement -observation) with the standard demographic covariate set. This is the -design choice that the per-row MRM modules implement. - -```{r design, eval = FALSE} -# Full ten-estimator ensemble on the canonical a01 design: -result <- estimate_ate( - data = otis, - outcome = "Y_vm_count", - treatment = "T_high_ac", - covariates = c("age", "sex", "region", "fiscal_year") -) -print(result) -``` - -The returned object summarises the IPW (Hajek), AIPW -(Robins--Rotnitzky--Zhao), g-computation, propensity-score-matching -(1:1 NN and five-strata subclass), IRM-DML -(Chernozhukov *et al.* 2018), PLR-DML, and SuperLearner-stacked AIPW -estimates. Multi-SE comparison (pooled, cluster on fiscal year, cluster -on individual ID, two-way) is reported alongside the IRM-DML primary. - -# Augmented IPW - -```{r aipw, eval = FALSE} -result_aipw <- estimate_aipw( - data = otis, - outcome = "Y_vm_count", - treatment = "T_high_ac", - covariates = c("age", "sex", "region", "fiscal_year") -) -print(result_aipw) -``` - -# Aggregate companion: incidence-rate ratios - -For aggregate (year-level) outcomes the analog is a Poisson or -negative-binomial GLM with cluster-robust standard errors. The MRM -framework reports both the per-row individual-level estimate (above) -and the aggregate IRR family in parallel; see the companion paper for -the formal aggregate-IRR notation. - -# Mandela classification - -A separate Mandela-Rules classifier (UN Mandela Rules 43 and 44) is -applied at both the federal and provincial levels. The provincial -implementation uses a duration-only proxy that is documented -explicitly in the framework paper. Federal counterpart analyses (SIU -IAP, Sprott--Doob--Iftene) live in the companion Python module -`morie.tps_csi` and `morie.siu_iap`. - -# Where to go next - -- The full MRM framework paper, including all ten estimators, - multi-SE comparison, propensity calibration, and the - Sprott--Doob--Iftene replication tables, is at Zenodo - [10.5281/zenodo.20096075](https://doi.org/10.5281/zenodo.20096075). -- The MORIE package paper covers the broader toolkit: - [10.5281/zenodo.20096350](https://doi.org/10.5281/zenodo.20096350). -- Citation: see `citation("morie")`. diff --git a/r-package/morie/inst/doc/mrm-otis-walkthrough.html b/r-package/morie/inst/doc/mrm-otis-walkthrough.html deleted file mode 100644 index a399ffc793..0000000000 --- a/r-package/morie/inst/doc/mrm-otis-walkthrough.html +++ /dev/null @@ -1,487 +0,0 @@ - - - - - - - - - - - - - -MRM walkthrough on OTIS provincial data - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

What this vignette covers

-

The MRM (Multilevel Reconciliation Methodology; -people-credit reading: McNamara–Ruhela–Medina) -framework is a coordinated set of ten causal estimators paired with a -multi-source data layer for Canadian carceral, police, and oversight -data. This vignette uses the provincial Offender Tracking Information -System (OTIS; published by the Ontario Ministry of the Solicitor -General) restrictive-confinement microdata as an example, applies the -ten-estimator ensemble to a binary-treatment design on dataset -a01, and shows how to read the resulting summary.

-

The mathematical foundations are developed in the companion paper -(Ruhela 2026, The MRM Framework, Zenodo 10.5281/zenodo.20096075).

-
-
-

Loading OTIS

-

OTIS is shipped with the package; the -morie_load_dataset() loader hides the SQLite-backed -indirection.

-
library(morie)
-otis <- morie_load_dataset("otis-2025-a01")
-str(otis)
-
-
-

The canonical a01 design

-

For dataset a01 the canonical formulation is -T_high_ac (a binary treatment derived from -administrative-classification flags) on Y_vm_count (a count -of a specific in-confinement observation) with the standard demographic -covariate set. This is the design choice that the per-row MRM modules -implement.

-
# Full ten-estimator ensemble on the canonical a01 design:
-result <- estimate_ate(
-  data       = otis,
-  outcome    = "Y_vm_count",
-  treatment  = "T_high_ac",
-  covariates = c("age", "sex", "region", "fiscal_year")
-)
-print(result)
-

The returned object summarises the IPW (Hajek), AIPW -(Robins–Rotnitzky–Zhao), g-computation, propensity-score-matching (1:1 -NN and five-strata subclass), IRM-DML (Chernozhukov et al. -2018), PLR-DML, and SuperLearner-stacked AIPW estimates. Multi-SE -comparison (pooled, cluster on fiscal year, cluster on individual ID, -two-way) is reported alongside the IRM-DML primary.

-
-
-

Augmented IPW

-
result_aipw <- estimate_aipw(
-  data       = otis,
-  outcome    = "Y_vm_count",
-  treatment  = "T_high_ac",
-  covariates = c("age", "sex", "region", "fiscal_year")
-)
-print(result_aipw)
-
-
-

Aggregate companion: incidence-rate ratios

-

For aggregate (year-level) outcomes the analog is a Poisson or -negative-binomial GLM with cluster-robust standard errors. The MRM -framework reports both the per-row individual-level estimate (above) and -the aggregate IRR family in parallel; see the companion paper for the -formal aggregate-IRR notation.

-
-
-

Mandela classification

-

A separate Mandela-Rules classifier (UN Mandela Rules 43 and 44) is -applied at both the federal and provincial levels. The provincial -implementation uses a duration-only proxy that is documented explicitly -in the framework paper. Federal counterpart analyses (SIU IAP, -Sprott–Doob–Iftene) live in the companion Python module -morie.tps_csi and morie.siu_iap.

-
-
-

Where to go next

-
    -
  • The full MRM framework paper, including all ten estimators, multi-SE -comparison, propensity calibration, and the Sprott–Doob–Iftene -replication tables, is at Zenodo 10.5281/zenodo.20096075.
  • -
  • The MORIE package paper covers the broader toolkit: 10.5281/zenodo.20096350.
  • -
  • Citation: see citation("morie").
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/signal-processing.html b/r-package/morie/inst/doc/signal-processing.html deleted file mode 100644 index 1eb49aaa6f..0000000000 --- a/r-package/morie/inst/doc/signal-processing.html +++ /dev/null @@ -1,450 +0,0 @@ - - - - - - - - - - - - - -Signal processing primitives - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

Beyond its causal-inference and survey-statistics surfaces, MORIE -exposes a small collection of signal-processing primitives. These are -useful for forensic-audio work, biomedical signals, and any analysis -that needs spectral or time-frequency methods adjacent to the causal -pipeline (e.g.\u00a0inter-incident time series in criminological data).

-

The R surface is intentionally thin \u2014 the heavy lifting lives in the -Python morie.signal_processing and -morie.homomorphic_deconvolution modules. The R wrappers -cover the most commonly needed primitives.

-
-
-

A synthetic two-tone signal

-
library(morie)
-
-set.seed(42)
-fs <- 1000                           # sampling rate (Hz)
-t  <- seq(0, 1, by = 1 / fs)
-sig <- sin(2 * pi * 50 * t) +
-       0.5 * sin(2 * pi * 120 * t) +
-       0.3 * rnorm(length(t))
-
-
-

Basic FFT

-
spec <- stats::fft(sig)
-n    <- length(sig)
-freq <- (0:(n / 2 - 1)) * fs / n
-mag  <- Mod(spec)[1:(n / 2)]
-peak_freqs <- freq[order(mag, decreasing = TRUE)[1:5]]
-peak_freqs
-#> [1]  49.95005 119.88012 120.87912  50.94905  48.95105
-

The two largest peaks should sit near 50 Hz and 120 Hz, recovering -the synthetic signal\u2019s components.

-
-
-

Where to go next

-
    -
  • Spectral analysis with windowing, multitaper estimation, and -short-time Fourier transforms is implemented in the Python -morie.signal_processing module \u2014 with the same RichResult -return convention \u2014 and is documented in the package paper.
  • -
  • Homomorphic deconvolution and cepstral methods (used in forensic -audio and biomedical-signal applications) live in -morie.homomorphic_deconvolution.
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/doc/survey-weighted.html b/r-package/morie/inst/doc/survey-weighted.html deleted file mode 100644 index 9aa93104c7..0000000000 --- a/r-package/morie/inst/doc/survey-weighted.html +++ /dev/null @@ -1,496 +0,0 @@ - - - - - - - - - - - - - -Survey-weighted estimation with MORIE - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Overview

-

Many MORIE analyses use Statistics Canada Public Use Microdata Files -(CCS, CSADS, CSUS, CADS, CPADS), which require complex-sample design -weights for valid inference. This vignette covers the sampling and -weighting helpers that wrap the same machinery.

-
-
-

Stratified, cluster, and PPS sampling

-
library(morie)
-set.seed(11)
-
-n <- 1000
-df <- data.frame(
-  id     = seq_len(n),
-  region = sample(c("ON", "QC", "BC", "AB"), n, replace = TRUE),
-  age    = round(rnorm(n, 40, 12)),
-  weight = runif(n, 0.5, 2.0)
-)
-
-# Stratified random sample within region.
-strat <- stratified_sample(df, strata_col = "region", n_per_stratum = 50)
-
-# Single-stage cluster sample on region (all units in sampled
-# clusters are kept).
-clust <- cluster_sample(df, cluster_col = "region", n_clusters = 2)
-
-dim(strat)
-#> [1] 200   5
-dim(clust)
-#> [1] 505   5
-
-
-

Calibration weights and design weights

-

compute_design_weights() and -calibration_weights() produce weights consistent with a -known sampling scheme and known population marginals.

-
# Design weights from a stratified scheme.
-# Treat `region` as the stratum; supply known population sizes per stratum.
-pop_sizes <- c(ON = 14000000, QC = 8500000, BC = 5200000, AB = 4400000)
-design <- compute_design_weights(
-  df,
-  strata_col       = "region",
-  population_sizes = pop_sizes
-)
-head(design)
-#> [1] 34000.00 34000.00 18107.00 55555.56 18107.00 55555.56
-
-
-

Effective sample size and design effect

-
# Effective sample size given a weight vector.
-ess <- effective_sample_size(df$weight)
-ess
-#> [1] 897.9353
-
-# Design effect: how much variance inflates from the weighting.
-deff <- design_effect(df$weight)
-deff
-#> [1] 1.113666
-

The effective sample size answers the practical question \u201chow many -independent observations does this weighted sample contain?\u201d The design -effect translates between unweighted-equivalent and complex-sample -variances.

-
-
-

Bootstrap variance with weights

-
# Bootstrap a weighted statistic --- here, the weighted mean of `age`.
-boot <- bootstrap_sample(
-  df,
-  statistic   = function(x) stats::weighted.mean(x$age, x$weight),
-  n_bootstrap = 100
-)
-str(boot, max.level = 1)
-#> List of 5
-#>  $ estimate    : num 39.9
-#>  $ se          : num 0.425
-#>  $ ci_lower    : Named num 39.1
-#>   ..- attr(*, "names")= chr "2.5%"
-#>  $ ci_upper    : Named num 40.6
-#>   ..- attr(*, "names")= chr "97.5%"
-#>  $ distribution: num [1:100] 40.3 40 39.1 39.9 39.6 ...
-
-
-

Where to go next

-
    -
  • For complex-survey causal inference (survey-weighted ATE/AIPW), see -the causal-inference vignette \u2014 the same -estimate_* functions accept a weights -argument.
  • -
  • For Statistics Canada PUMF acknowledgments and citation -requirements, see the package README.
  • -
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/r-package/morie/inst/extdata/siu_canonical_overrides.csv.gz b/r-package/morie/inst/extdata/siu_canonical_overrides.csv.gz new file mode 100644 index 0000000000..98afaee765 Binary files /dev/null and b/r-package/morie/inst/extdata/siu_canonical_overrides.csv.gz differ diff --git a/r-package/morie/inst/extdata/siu_drid_manifest.csv.gz b/r-package/morie/inst/extdata/siu_drid_manifest.csv.gz new file mode 100644 index 0000000000..c7bb60f38d Binary files /dev/null and b/r-package/morie/inst/extdata/siu_drid_manifest.csv.gz differ diff --git a/r-package/morie/man/agset.Rd b/r-package/morie/man/agset.Rd index 32b4a7899a..36df5c1926 100644 --- a/r-package/morie/man/agset.Rd +++ b/r-package/morie/man/agset.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/agset.R \name{agset} \alias{agset} -\alias{agenda_setter_power} +\alias{morie_agenda_setter_power} \title{Romer-Rosenthal agenda-setter outcome and power (Armstrong Ch 10)} \usage{ agset(options, setter_ideal, reversion) -agenda_setter_power(options, setter_ideal, reversion) +morie_agenda_setter_power(options, setter_ideal, reversion) } \arguments{ \item{options}{Discrete set of feasible policy proposals.} @@ -26,9 +26,7 @@ legislature accepts iff the proposal beats the reversion at the median voter. Power = distance moved from the status quo. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \keyword{internal} diff --git a/r-package/morie/man/algnm.Rd b/r-package/morie/man/algnm.Rd index 18db2a6e5f..d694dc669b 100644 --- a/r-package/morie/man/algnm.Rd +++ b/r-package/morie/man/algnm.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/algnm.R \name{algnm} \alias{algnm} -\alias{party_alignment} +\alias{morie_party_alignment} \title{Rice party cohesion index (Rice 1928; Armstrong Ch 8)} \usage{ algnm(x, party = NULL) -party_alignment(x, party = NULL) +morie_party_alignment(x, party = NULL) } \arguments{ \item{x}{Either a 0/1 vote vector for one party, or a numeric @@ -24,9 +24,6 @@ Rice_p = |\%yea_p - \%nay_p| over a series of roll calls; with a \code{party} indicator, averages per-roll-call Rice over each party. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +algnm(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/anova_one_way.Rd b/r-package/morie/man/anova_one_way.Rd deleted file mode 100644 index 14eb8bca53..0000000000 --- a/r-package/morie/man/anova_one_way.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by morie generate_rd.py -\name{anova_one_way} -\alias{anova_one_way} -\title{One-way ANOVA} -\description{ - One-way ANOVA -} -\usage{ - anova_one_way(...) -} -\arguments{ - \item{...}{Numeric vectors, one per group.} -} -\value{ - Named list: 'F', 'df_between', 'df_within', 'p_value', -} -\examples{ - anova_one_way(rnorm(30, 0), rnorm(30, 0.5), rnorm(30, 1)) -} diff --git a/r-package/morie/man/antth.Rd b/r-package/morie/man/antth.Rd index 13c17ad54c..63e6163e88 100644 --- a/r-package/morie/man/antth.Rd +++ b/r-package/morie/man/antth.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/antth.R \name{antth} \alias{antth} -\alias{antithetic_variates} +\alias{morie_antithetic_variates} \title{Antithetic variates (Hammersley & Morton 1956)} \usage{ antth(x = NULL, f = NULL, N = 1000L, seed = 42L) -antithetic_variates(x = NULL, f = NULL, N = 1000L, seed = 42L) +morie_antithetic_variates(x = NULL, f = NULL, N = 1000L, seed = 42L) } \arguments{ \item{x}{optional U(0,1) sample; if NULL, draw N points.} diff --git a/r-package/morie/man/ask_percy.Rd b/r-package/morie/man/ask_percy.Rd deleted file mode 100644 index c76b93407b..0000000000 --- a/r-package/morie/man/ask_percy.Rd +++ /dev/null @@ -1,29 +0,0 @@ -\name{ask_percy} -\alias{ask_percy} -\title{Ask the Perseus assistant a question} -\description{ - Send a free-form question to the Perseus assistant via the bundled - Python LLM bridge. Returns the assistant's text response. -} -\usage{ - ask_percy(question, context = NULL, - python_bin = Sys.getenv("MORIE_PYTHON_BIN", "python3")) -} -\arguments{ - \item{question}{Character string. The natural-language question.} - \item{context}{Optional named list of context variables (data summaries, - column descriptions, etc.) appended to the prompt.} - \item{python_bin}{Path to the Python interpreter that has the - \pkg{morie} Python package installed. Defaults to - \code{$MORIE_PYTHON_BIN} or \code{python3}.} -} -\value{ - Character string containing the assistant's response. -} -\keyword{internal} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/audit_public_outputs.Rd b/r-package/morie/man/audit_public_outputs.Rd deleted file mode 100644 index cc7cd30115..0000000000 --- a/r-package/morie/man/audit_public_outputs.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by morie generate_rd.py -\name{audit_public_outputs} -\alias{audit_public_outputs} -\title{Audit declared outputs against files on disk} -\description{ - Audit declared outputs against files on disk -} -\usage{ - audit_public_outputs(project_root, manifest) -} -\arguments{ - \item{project_root}{Project root directory.} - \item{manifest}{Manifest data frame. If 'NULL', loaded from disk.} -} -\value{ - Data frame containing declared and observed output status. -} -\examples{ -\dontrun{ - # Walks the project's outputs_manifest.csv and confirms every - # declared public artefact exists on disk. - manifest <- read_outputs_manifest(project_root = ".") - audit <- audit_public_outputs(project_root = ".", - manifest = manifest) - summarize_output_audit(audit) -} -} diff --git a/r-package/morie/man/bootstrap_sample.Rd b/r-package/morie/man/bootstrap_sample.Rd deleted file mode 100644 index ec0653b891..0000000000 --- a/r-package/morie/man/bootstrap_sample.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{bootstrap_sample} -\alias{bootstrap_sample} -\title{Bootstrap variance for an arbitrary statistic} -\description{ - Bootstrap resampling for any statistic -} -\usage{ - bootstrap_sample(df, statistic, n_bootstrap, seed) -} -\arguments{ - \item{df}{A data frame.} - \item{statistic}{A function taking a data frame and returning a scalar.} - \item{n_bootstrap}{Number of bootstrap replicates.} - \item{seed}{Random seed.} -} -\value{ - Named list: 'estimate', 'se', 'ci_lower', 'ci_upper', -} -\examples{ - df <- data.frame(x = rnorm(100)) - bootstrap_sample(df, statistic = function(d) mean(d$x)) -} diff --git a/r-package/morie/man/brdgr.Rd b/r-package/morie/man/brdgr.Rd index 4bdbfa68ae..c0b5e602f1 100644 --- a/r-package/morie/man/brdgr.Rd +++ b/r-package/morie/man/brdgr.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/brdgr.R \name{brdgr} \alias{brdgr} -\alias{bridge_observations} +\alias{morie_bridge_observations} \title{Bridge observations across sessions / chambers (Armstrong Ch 6)} \usage{ brdgr(x, y = NULL) -bridge_observations(x, y = NULL) +morie_bridge_observations(x, y = NULL) } \arguments{ \item{x}{Vector of session-1 IDs or session-1 vote matrix.} @@ -23,9 +23,6 @@ Counts legislators appearing in both sessions. With ID vectors: returns set intersection. With matrices: rows non-empty in both. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +brdgr(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/btsrp.Rd b/r-package/morie/man/btsrp.Rd index f38643231c..1dc84cd660 100644 --- a/r-package/morie/man/btsrp.Rd +++ b/r-package/morie/man/btsrp.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/btsrp.R \name{btsrp} \alias{btsrp} -\alias{bootstrap_ci} +\alias{morie_bootstrap_ci} \title{Bootstrap confidence interval (percentile, BCa, studentized)} \usage{ btsrp( @@ -14,7 +14,7 @@ btsrp( seed = 42L ) -bootstrap_ci( +morie_bootstrap_ci( x, statistic = NULL, B = 2000L, @@ -40,7 +40,7 @@ bootstrap_ci( Named list with estimate, se, ci_lower, ci_upper, alpha, B, n, method. } \description{ -R parity of \code{morie.fn.btsrp.bootstrap_ci}. Three methods are +R parity of \code{morie.fn.btsrp.morie_bootstrap_ci}. Three methods are supported: percentile, BCa (Efron 1987 JASA), and studentized (Hall 1988 nested resampling). } diff --git a/r-package/morie/man/build_assistant_prompt.Rd b/r-package/morie/man/build_assistant_prompt.Rd deleted file mode 100644 index ee02aa9469..0000000000 --- a/r-package/morie/man/build_assistant_prompt.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{build_assistant_prompt} -\alias{build_assistant_prompt} -\title{Build an MORIE assistant prompt} -\description{ - Build an MORIE assistant prompt -} -\usage{ - build_assistant_prompt(question, context) -} -\arguments{ - \item{question}{User question.} - \item{context}{Optional context string.} -} -\value{ - Character scalar prompt. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/build_outputs_manifest.Rd b/r-package/morie/man/build_outputs_manifest.Rd deleted file mode 100644 index b998d5b693..0000000000 --- a/r-package/morie/man/build_outputs_manifest.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by morie generate_rd.py -\name{build_outputs_manifest} -\alias{build_outputs_manifest} -\title{Build an outputs manifest from a directory of artifacts} -\description{ - Build an outputs manifest from a directory of artifacts -} -\usage{ - build_outputs_manifest(output_dir, manifest_path, public_prefix, extensions) -} -\arguments{ - \item{output_dir}{Directory containing output files.} - \item{manifest_path}{CSV path to write.} - \item{public_prefix}{Prefix used in 'public_path' values.} - \item{extensions}{File extensions to include (without dots).} -} -\value{ - Manifest data frame. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/build_prompt.Rd b/r-package/morie/man/build_prompt.Rd deleted file mode 100644 index 45b553a526..0000000000 --- a/r-package/morie/man/build_prompt.Rd +++ /dev/null @@ -1,24 +0,0 @@ -\name{build_prompt} -\alias{build_prompt} -\title{Build a structured prompt for the Perseus assistant} -\description{ - Compose a question and optional context into the structured prompt - format expected by \code{\link{ask_percy}}. -} -\usage{ - build_prompt(question, context = NULL) -} -\arguments{ - \item{question}{Character string. The natural-language question.} - \item{context}{Optional named list of context variables.} -} -\value{ - Character string. The composed prompt. -} -\keyword{internal} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/buttbp.Rd b/r-package/morie/man/buttbp.Rd index 0b0079fc31..7a709ae9ea 100644 --- a/r-package/morie/man/buttbp.Rd +++ b/r-package/morie/man/buttbp.Rd @@ -31,7 +31,7 @@ if (requireNamespace("signal", quietly = TRUE)) { t <- seq(0, 1, length.out = 1000) # 2 Hz drift + 10 Hz band of interest + 60 Hz noise x <- sin(2 * pi * 2 * t) + sin(2 * pi * 10 * t) + - 0.3 * sin(2 * pi * 60 * t) + 0.3 * sin(2 * pi * 60 * t) y <- buttbp(x, fs = 1000, low = 5, high = 20) length(y$filtered) } diff --git a/r-package/morie/man/buttbs.Rd b/r-package/morie/man/buttbs.Rd index 416f0afe70..be933a7ccf 100644 --- a/r-package/morie/man/buttbs.Rd +++ b/r-package/morie/man/buttbs.Rd @@ -30,7 +30,7 @@ if (requireNamespace("signal", quietly = TRUE)) { set.seed(1) t <- seq(0, 1, length.out = 1000) x <- sin(2 * pi * 10 * t) + sin(2 * pi * 60 * t) - y <- buttbs(x, fs = 1000) # remove 60 Hz mains + y <- buttbs(x, fs = 1000) # remove 60 Hz mains length(y$filtered) } } diff --git a/r-package/morie/man/butthp.Rd b/r-package/morie/man/butthp.Rd index f0bb9c39fb..f9a27c075d 100644 --- a/r-package/morie/man/butthp.Rd +++ b/r-package/morie/man/butthp.Rd @@ -28,7 +28,7 @@ signals (EEG, ECG) prior to analysis. if (requireNamespace("signal", quietly = TRUE)) { set.seed(1) t <- seq(0, 1, length.out = 500) - x <- 5 * t + sin(2 * pi * 10 * t) # linear drift + 10 Hz signal + x <- 5 * t + sin(2 * pi * 10 * t) # linear drift + 10 Hz signal y <- butthp(x, fs = 500, cutoff = 1) length(y$filtered) } diff --git a/r-package/morie/man/buttlp.Rd b/r-package/morie/man/buttlp.Rd index f867c71883..3810259aa3 100644 --- a/r-package/morie/man/buttlp.Rd +++ b/r-package/morie/man/buttlp.Rd @@ -27,10 +27,10 @@ biological or geophysical time series. \donttest{ if (requireNamespace("signal", quietly = TRUE)) { set.seed(1) - t <- seq(0, 1, length.out = 500) - x <- sin(2 * pi * 5 * t) + 0.5 * sin(2 * pi * 60 * t) # 5 Hz + 60 Hz - y <- buttlp(x, fs = 500, cutoff = 20) - length(y$filtered) # 500 + t <- seq(0, 1, length.out = 500) + x <- sin(2 * pi * 5 * t) + 0.5 * sin(2 * pi * 60 * t) # 5 Hz + 60 Hz + y <- buttlp(x, fs = 500, cutoff = 20) + length(y$filtered) # 500 } } } diff --git a/r-package/morie/man/bysid.Rd b/r-package/morie/man/bysid.Rd index c9b46350fb..f1259f2e44 100644 --- a/r-package/morie/man/bysid.Rd +++ b/r-package/morie/man/bysid.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/bysid.R \name{bysid} \alias{bysid} -\alias{bayesian_ideal_points} +\alias{morie_bayesian_ideal_points} \title{Bayesian ideal-point estimation (Armstrong Ch 5)} \usage{ bysid(x, n_iter = 400L, burn = 100L, seed = 0L, deterministic_seed = NULL) -bayesian_ideal_points( +morie_bayesian_ideal_points( x, n_iter = 400L, burn = 100L, @@ -38,9 +38,6 @@ Metropolis-within-Gibbs surrogate for Clinton-Jackman-Rivers (2004) with N(0,1) prior on x and N(0, 5) priors on (alpha, beta). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +bysid(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/calibration_weights.Rd b/r-package/morie/man/calibration_weights.Rd deleted file mode 100644 index 073e7f2ad3..0000000000 --- a/r-package/morie/man/calibration_weights.Rd +++ /dev/null @@ -1,39 +0,0 @@ -% Generated by morie generate_rd.py -\name{calibration_weights} -\alias{calibration_weights} -\title{Calibration weights via iterative proportional fitting (raking)} -\description{ - Calibration weights via iterative proportional fitting (raking) - -Adjusts initial design weights so that weighted marginal totals match -known population totals for each auxiliary variable. -} -\usage{ - calibration_weights(df, aux_vars, population_totals, initial_weights, max_iter, tol) -} -\arguments{ - \item{df}{A data frame.} - \item{aux_vars}{Character vector of categorical auxiliary variable names.} - \item{population_totals}{Named list: '"var_level"' -> population count.} - \item{initial_weights}{Optional numeric vector of starting weights.} - \item{max_iter}{Maximum IPF iterations.} - \item{tol}{Convergence tolerance.} -} -\value{ - Numeric vector of calibrated weights. -} -\examples{ -set.seed(2026) -n <- 100L -df <- data.frame( - age_group = sample(c("18-24", "25-34", "35+"), n, replace = TRUE), - gender = sample(c("F", "M"), n, replace = TRUE) -) -# Suppose the true population proportions are known: -totals <- list(age_group = c("18-24" = 1000, "25-34" = 2000, "35+" = 3000), - gender = c("F" = 3000, "M" = 3000)) -w <- calibration_weights(df, - aux_vars = c("age_group", "gender"), - population_totals = totals) -summary(w) -} diff --git a/r-package/morie/man/canonicalize_cpads_data.Rd b/r-package/morie/man/canonicalize_cpads_data.Rd deleted file mode 100644 index 79ae09aa99..0000000000 --- a/r-package/morie/man/canonicalize_cpads_data.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by morie generate_rd.py -\name{canonicalize_cpads_data} -\alias{canonicalize_cpads_data} -\title{Canonicalize raw CPADS PUMF columns} -\description{ - Canonicalize raw CPADS PUMF columns -} -\usage{ - canonicalize_cpads_data(data) -} -\arguments{ - \item{data}{Raw CPADS data frame.} -} -\value{ - Data frame with canonical MORIE analysis columns. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/chi_square_test.Rd b/r-package/morie/man/chi_square_test.Rd deleted file mode 100644 index 53772156f0..0000000000 --- a/r-package/morie/man/chi_square_test.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{chi_square_test} -\alias{chi_square_test} -\title{Chi-square test of independence or goodness-of-fit} -\description{ - Chi-square test of independence or goodness-of-fit -} -\usage{ - chi_square_test(observed, expected) -} -\arguments{ - \item{observed}{Observed counts (matrix for independence, vector for GOF).} - \item{expected}{Expected counts for GOF (optional; uniform if NULL).} -} -\value{ - Named list: 'chi_sq', 'df', 'p_value', 'cramers_v'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/ckan_metadata.Rd b/r-package/morie/man/ckan_metadata.Rd index 8238fbc37c..b638cc492e 100644 --- a/r-package/morie/man/ckan_metadata.Rd +++ b/r-package/morie/man/ckan_metadata.Rd @@ -1,22 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} \name{ckan_metadata} \alias{ckan_metadata} -\docType{data} -\title{CKAN dataset-source metadata} -\description{ - Metadata table for the CKAN-published datasets that - \pkg{morie} can fetch and analyse, keyed by dataset short-name. +\title{CKAN Metadata for Open Data APIs} +\format{ +A data.frame with columns: +\describe{ +\item{survey}{Survey abbreviation: cpads, csads, csus} +\item{name}{Full survey name} +\item{package_id}{CKAN package UUID} +\item{metadata_url}{URL to retrieve full package metadata} +} +} +\source{ +\url{https://open.canada.ca} } \usage{ - data(ckan_metadata) +ckan_metadata } -\format{ - A data frame. One row per registered CKAN dataset with columns - describing the publishing agency, resource id, and last refresh date. +\description{ +Package IDs and metadata URLs for accessing CPADS, CSADS, and CSUS +datasets via the Canadian Open Data CKAN API. } -\keyword{datasets} \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +data(ckan_metadata) +ckan_metadata$metadata_url } +\keyword{datasets} diff --git a/r-package/morie/man/cluster_sample.Rd b/r-package/morie/man/cluster_sample.Rd deleted file mode 100644 index 832c4cf69e..0000000000 --- a/r-package/morie/man/cluster_sample.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by morie generate_rd.py -\name{cluster_sample} -\alias{cluster_sample} -\title{Two-stage cluster sampling} -\description{ - Two-stage cluster sampling - -Randomly selects 'n_clusters' clusters, then takes all units within -selected clusters. -} -\usage{ - cluster_sample(df, cluster_col, n_clusters, seed) -} -\arguments{ - \item{df}{A data frame.} - \item{cluster_col}{Name of the cluster identifier column.} - \item{n_clusters}{Number of clusters to select.} - \item{seed}{Random seed.} -} -\value{ - Data frame of selected units with '.weight' column. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/cndrc.Rd b/r-package/morie/man/cndrc.Rd index cd8a2a50b7..071f5f2138 100644 --- a/r-package/morie/man/cndrc.Rd +++ b/r-package/morie/man/cndrc.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/cndrc.R \name{cndrc} \alias{cndrc} -\alias{condorcet_winner} +\alias{morie_condorcet_winner} \title{Condorcet winner detection (Armstrong Ch 2)} \usage{ cndrc(preference_matrix) -condorcet_winner(preference_matrix) +morie_condorcet_winner(preference_matrix) } \arguments{ \item{preference_matrix}{n by n matrix where entry (i, j) = number of @@ -23,9 +23,6 @@ majority voting. Returns the index of the Condorcet winner, or -1 if none exists. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +cndrc(preference_matrix = matrix(rnorm(25), 5, 5)) } \keyword{internal} diff --git a/r-package/morie/man/cntrl_estimator.Rd b/r-package/morie/man/cntrl_estimator.Rd index d29ce740bd..a662b413b0 100644 --- a/r-package/morie/man/cntrl_estimator.Rd +++ b/r-package/morie/man/cntrl_estimator.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/cntrl.R \name{cntrl_estimator} \alias{cntrl_estimator} -\alias{control_variates} +\alias{morie_control_variates} \title{Control variates (Nelson 1990)} \usage{ cntrl_estimator(y, c_var, mu_c) -control_variates(y, c_var, mu_c) +morie_control_variates(y, c_var, mu_c) } \arguments{ \item{y}{numeric; outcome samples.} diff --git a/r-package/morie/man/cohens_d.Rd b/r-package/morie/man/cohens_d.Rd deleted file mode 100644 index 21acaa3749..0000000000 --- a/r-package/morie/man/cohens_d.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by morie generate_rd.py -\name{cohens_d} -\alias{cohens_d} -\title{Cohen's d effect size} -\description{ - Cohen's d effect size -} -\usage{ - cohens_d(x1, x2, pooled) -} -\arguments{ - \item{x1}{Numeric vector (group 1).} - \item{x2}{Numeric vector (group 2).} - \item{pooled}{Use pooled SD (default 'TRUE'). If 'FALSE', uses 'sd(x2)'.} -} -\value{ - Numeric Cohen's d. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/coherence.Rd b/r-package/morie/man/coherence.Rd deleted file mode 100644 index 2f91b0629e..0000000000 --- a/r-package/morie/man/coherence.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cohrc.R -\name{coherence} -\alias{coherence} -\title{Magnitude-squared coherence between two time series} -\usage{ -coherence(x, y, nperseg = NULL, fs = 1) -} -\arguments{ -\item{x}{Numeric vector.} - -\item{y}{Numeric vector (same length).} - -\item{nperseg}{Segment length. Default n/4.} - -\item{fs}{Sampling frequency. Default 1.} -} -\value{ -Named list with \code{frequencies, coherence, n_segments, - nperseg, fs, n, method}. -} -\description{ -Magnitude-squared coherence between two time series -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/cokrg.Rd b/r-package/morie/man/cokrg.Rd index c606233fde..50d90a34e3 100644 --- a/r-package/morie/man/cokrg.Rd +++ b/r-package/morie/man/cokrg.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/cokrg.R \name{cokrg} \alias{cokrg} -\alias{cokriging} -\title{Simple cokriging for co-located bivariate spatial prediction.} +\alias{morie_cokriging} +\title{Simple morie_cokriging for co-located bivariate spatial prediction.} \usage{ cokrg( x, @@ -19,7 +19,7 @@ cokrg( nugget = 0 ) -cokriging( +morie_cokriging( x, y, coords, @@ -58,10 +58,7 @@ Named list: estimate, se, n, method. \deqn{[C_{pp} \; C_{ps}; C_{ps}^\top \; C_{ss}] [\lambda; \mu] = [c_{0p}; c_{0s}]}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +cokrg(x = rnorm(50), y = rnorm(50), coords = matrix(runif(100), 50, 2), target = rnorm(50)) } \references{ Schabenberger & Gotway (2005), Ch 4. diff --git a/r-package/morie/man/compute_design_weights.Rd b/r-package/morie/man/compute_design_weights.Rd deleted file mode 100644 index 0b9bdf69bc..0000000000 --- a/r-package/morie/man/compute_design_weights.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by morie generate_rd.py -\name{compute_design_weights} -\alias{compute_design_weights} -\title{Compute inverse-probability design weights} -\description{ - Compute inverse-probability design weights -} -\usage{ - compute_design_weights(df, strata_col, population_sizes) -} -\arguments{ - \item{df}{A data frame.} - \item{strata_col}{Name of the stratification column.} - \item{population_sizes}{Named integer vector: stratum level -> population size.} -} -\value{ - Numeric vector of design weights (same length as 'nrow(df)'). -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/copul.Rd b/r-package/morie/man/copul.Rd index 4e7a6e2a69..32172dbd9e 100644 --- a/r-package/morie/man/copul.Rd +++ b/r-package/morie/man/copul.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/copul.R \name{copul} \alias{copul} -\alias{copula_estimation} +\alias{morie_copula_estimation} \title{Copula parameter estimation (Gaussian/Clayton/Gumbel; Nelsen 2006)} \usage{ copul(x, y, family = c("gaussian", "clayton", "gumbel")) -copula_estimation(x, y, family = c("gaussian", "clayton", "gumbel")) +morie_copula_estimation(x, y, family = c("gaussian", "clayton", "gumbel")) } \arguments{ \item{x, y}{numeric marginal samples.} @@ -15,7 +15,7 @@ copula_estimation(x, y, family = c("gaussian", "clayton", "gumbel")) \item{family}{"gaussian", "clayton", or "gumbel".} } \value{ -list: estimate, kendall_tau, se_tau, u, v, family, n, method. +list: estimate, morie_kendall_tau, se_tau, u, v, family, n, method. } \description{ Kendall's tau inversion: diff --git a/r-package/morie/man/cpads_contract.Rd b/r-package/morie/man/cpads_contract.Rd deleted file mode 100644 index d83ed0340b..0000000000 --- a/r-package/morie/man/cpads_contract.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by morie generate_rd.py -\name{cpads_contract} -\alias{cpads_contract} -\title{Return the canonical CPADS local-data contract} -\description{ - Return the canonical CPADS local-data contract -} -\value{ - Named list describing the expected local CPADS contract. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/cramers_v.Rd b/r-package/morie/man/cramers_v.Rd deleted file mode 100644 index 485f66aea5..0000000000 --- a/r-package/morie/man/cramers_v.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by morie generate_rd.py -\name{cramers_v} -\alias{cramers_v} -\title{Cramer's V for categorical association} -\description{ - Cramer's V for categorical association -} -\usage{ - cramers_v(contingency_table) -} -\arguments{ - \item{contingency_table}{A numeric matrix of observed counts.} -} -\value{ - Numeric Cramer's V in [0, 1]. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/csphr.Rd b/r-package/morie/man/csphr.Rd index d45e0454a0..166ea2a14c 100644 --- a/r-package/morie/man/csphr.Rd +++ b/r-package/morie/man/csphr.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/csphr.R \name{csphr} \alias{csphr} -\alias{cutting_plane_sphere} +\alias{morie_cutting_plane_sphere} \title{Cutting plane (linear discriminant; Armstrong Ch 3)} \usage{ csphr(x, votes = NULL) -cutting_plane_sphere(x, votes = NULL) +morie_cutting_plane_sphere(x, votes = NULL) } \arguments{ \item{x}{Ideal-point matrix (n by p).} @@ -24,9 +24,6 @@ classifies legislators by Mahalanobis-style projection onto the between-class direction. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +csphr(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/dataset_catalog.Rd b/r-package/morie/man/dataset_catalog.Rd index a9e6e4d7b2..411295995b 100644 --- a/r-package/morie/man/dataset_catalog.Rd +++ b/r-package/morie/man/dataset_catalog.Rd @@ -1,23 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} \name{dataset_catalog} \alias{dataset_catalog} -\docType{data} -\title{Built-in MORIE dataset catalog} -\description{ - Table of the datasets that ship with \pkg{morie} (Canadian - carceral, police, and oversight data plus general epidemiological - reference data). Mirrors the Python-side \code{DATASET_CATALOG}. +\title{MORIE Dataset Catalog} +\format{ +A data.frame with columns: +\describe{ +\item{key}{Unique catalog key (e.g., \code{"opencanada_cpads_2021"})} +\item{name}{Human-readable dataset name} +\item{source}{Data source: opencanada, healthinfobase, or cihi} +\item{survey}{Survey abbreviation: cpads, ccs, csads, csus, or indicators} +\item{year}{Year or year range (e.g., \code{"2021-2022"})} +\item{format}{File format: csv or xlsx} +\item{type}{Data type: pumf, bootstrap, aggregate, or indicator} +\item{large_file}{Logical; TRUE for bootstrap weight files (>100MB)} +\item{local_path}{Relative path to the local data file} +\item{table_name}{SQLite table name in the DBI cache} +\item{ckan_resource_id}{CKAN DataStore resource ID (empty if unavailable)} +} +} +\source{ +Health Canada, CIHI, Statistics Canada open data portals. } \usage{ - data(dataset_catalog) +dataset_catalog } -\format{ - A data frame keyed by dataset short-name with columns describing - source, survey, year, file path, and CKAN resource identifier. +\description{ +A data.frame listing all Canadian public health datasets available +through the MORIE data management system. Each row describes one +dataset with its source, survey, year, format, and access metadata. } -\keyword{datasets} \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +data(dataset_catalog) +head(dataset_catalog) } +\keyword{datasets} diff --git a/r-package/morie/man/default_synthetic_name_map.Rd b/r-package/morie/man/default_synthetic_name_map.Rd deleted file mode 100644 index c7f8fc529f..0000000000 --- a/r-package/morie/man/default_synthetic_name_map.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by morie generate_rd.py -\name{default_synthetic_name_map} -\alias{default_synthetic_name_map} -\title{Default column-name map for synthetic data generation} -\description{ - Default synthetic-data variable name map - -Returns a named character vector mapping canonical variable keys used by -[generate_synthetic_data()] to output column names. -} -\usage{ - default_synthetic_name_map(profile) -} -\arguments{ - \item{profile}{Name profile. '"generic"' is recommended for new projects.} -} -\value{ - Named character vector. -} -\examples{ -# Default profile -nm <- default_synthetic_name_map() -head(nm) - -# Generic profile (drops domain-specific names) -nm_generic <- default_synthetic_name_map("generic") -head(nm_generic) -} diff --git a/r-package/morie/man/default_workflow_map.Rd b/r-package/morie/man/default_workflow_map.Rd deleted file mode 100644 index a578f641f9..0000000000 --- a/r-package/morie/man/default_workflow_map.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by morie generate_rd.py -\name{default_workflow_map} -\alias{default_workflow_map} -\title{Default workflow step map} -\description{ - Default workflow step map - -Returns the default named map of workflow steps to project script paths. -} -\value{ - Named character vector. -} -\examples{ -m <- default_workflow_map() -names(m) # workflow step names -m[["modules"]] # script path for the 'modules' step -} diff --git a/r-package/morie/man/design_effect.Rd b/r-package/morie/man/design_effect.Rd deleted file mode 100644 index 3b43b3ce21..0000000000 --- a/r-package/morie/man/design_effect.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by morie generate_rd.py -\name{design_effect} -\alias{design_effect} -\title{Design effect (DEFF)} -\description{ - Design effect (DEFF) -} -\usage{ - design_effect(weights) -} -\arguments{ - \item{weights}{Numeric vector of sampling weights.} -} -\value{ - Numeric design effect (= n / ESS). -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/dimrd.Rd b/r-package/morie/man/dimrd.Rd index ec5b8219c1..9781c5966f 100644 --- a/r-package/morie/man/dimrd.Rd +++ b/r-package/morie/man/dimrd.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/dimrd.R \name{dimrd} \alias{dimrd} -\alias{dimensionality_test} +\alias{morie_dimensionality_test} \title{Dimensionality test via Kaiser scree (Armstrong Ch 7)} \usage{ dimrd(x, threshold = 1) -dimensionality_test(x, threshold = 1) +morie_dimensionality_test(x, threshold = 1) } \arguments{ \item{x}{Numeric matrix (n by m).} @@ -24,9 +24,6 @@ formed from x (input matrix is used directly if symmetric; else uses the column correlation matrix). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +dimrd(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/dot-siu_curl_version.Rd b/r-package/morie/man/dot-siu_curl_version.Rd new file mode 100644 index 0000000000..8480ce01a3 --- /dev/null +++ b/r-package/morie/man/dot-siu_curl_version.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{.siu_curl_version} +\alias{.siu_curl_version} +\title{libcurl version string morie was built against} +\usage{ +.siu_curl_version() +} +\value{ +A length-1 character vector. +} +\description{ +libcurl version string morie was built against +} +\keyword{internal} diff --git a/r-package/morie/man/dot-siu_http_get.Rd b/r-package/morie/man/dot-siu_http_get.Rd new file mode 100644 index 0000000000..7af84a127c --- /dev/null +++ b/r-package/morie/man/dot-siu_http_get.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{.siu_http_get} +\alias{.siu_http_get} +\title{Fetch a single URL over HTTP(S) via libcurl} +\usage{ +.siu_http_get(url, timeout_s = 60L) +} +\arguments{ +\item{url}{URL to fetch.} + +\item{timeout_s}{Request timeout in seconds.} +} +\value{ +The response body as a length-1 character vector. +} +\description{ +Internal building block of the SIU parser. Returns the response +body, or an empty string on any transport-level failure. +} +\keyword{internal} diff --git a/r-package/morie/man/dot-siu_http_get_many.Rd b/r-package/morie/man/dot-siu_http_get_many.Rd new file mode 100644 index 0000000000..b78a4199cf --- /dev/null +++ b/r-package/morie/man/dot-siu_http_get_many.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{.siu_http_get_many} +\alias{.siu_http_get_many} +\title{Fetch many URLs concurrently via libcurl, with rate-limiting + retry} +\usage{ +.siu_http_get_many( + urls, + concurrency = 4L, + timeout_s = 60L, + rate_rps = 4, + max_retries = 3L +) +} +\arguments{ +\item{urls}{Character vector of URLs.} + +\item{concurrency}{Maximum simultaneous transfers.} + +\item{timeout_s}{Per-request timeout in seconds.} + +\item{rate_rps}{Maximum request starts per second across the pool. +Default \code{4.0} is a polite scrape rate that stays well under +any common WAF threshold. Set very large (e.g. \code{1e9}) to +effectively disable throttling.} + +\item{max_retries}{Maximum retry attempts per URL on 429/5xx / +transport failure.} +} +\value{ +A character vector of response bodies, parallel to \code{urls}. +} +\description{ +Drives up to \code{concurrency} simultaneous transfers, but with a +global token-bucket limit of \code{rate_rps} request starts per +second across the whole pool. HTTP 429/502/503/504 and transport +errors are retried up to \code{max_retries} times with exponential +backoff (250ms * 2^attempt). Final failures yield an empty string +at their slot. +} +\details{ +Throttling is the safe default for SIU and similar small-gov +endpoints: hammering them with 16-24 concurrent requests triggers +WAF/Cloudflare-style bot-protection that returns short +interstitial pages, which look like data but aren't. +} +\keyword{internal} diff --git a/r-package/morie/man/dot-siu_http_get_many_with_status.Rd b/r-package/morie/man/dot-siu_http_get_many_with_status.Rd new file mode 100644 index 0000000000..e64e7848df --- /dev/null +++ b/r-package/morie/man/dot-siu_http_get_many_with_status.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{.siu_http_get_many_with_status} +\alias{.siu_http_get_many_with_status} +\title{Fetch many URLs and return body + http_code + attempts} +\usage{ +.siu_http_get_many_with_status( + urls, + concurrency = 4L, + timeout_s = 60L, + rate_rps = 4, + max_retries = 3L +) +} +\value{ +A list with three parallel slots: \code{body} (character), +\code{http_code} (integer), \code{attempts} (integer). +} +\description{ +Same throttle/retry behaviour as \code{.siu_http_get_many} but the +return value preserves the HTTP status code and attempt count for +each URL, so callers can distinguish a healthy 200 with a small +body from a 429/503/short interstitial. Used by the DRID manifest +builder (\code{morie_siu_refresh_manifest}). +} +\keyword{internal} diff --git a/r-package/morie/man/dot-siu_parse_news.Rd b/r-package/morie/man/dot-siu_parse_news.Rd new file mode 100644 index 0000000000..7db3be9c9b --- /dev/null +++ b/r-package/morie/man/dot-siu_parse_news.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{.siu_parse_news} +\alias{.siu_parse_news} +\title{Parse one SIU news-release HTML page} +\usage{ +.siu_parse_news(html, nrid, url) +} +\arguments{ +\item{html}{The news-release page HTML.} + +\item{nrid}{The news-release id.} + +\item{url}{The source URL of the news-release page.} +} +\value{ +A named character vector: nrid, source_url_news, +news_release_title, news_release_date_iso, news_release_date_raw, +news_release_summary. +} +\description{ +Parse one SIU news-release HTML page +} +\keyword{internal} diff --git a/r-package/morie/man/dot-siu_parse_report.Rd b/r-package/morie/man/dot-siu_parse_report.Rd new file mode 100644 index 0000000000..9a182869b9 --- /dev/null +++ b/r-package/morie/man/dot-siu_parse_report.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{.siu_parse_report} +\alias{.siu_parse_report} +\title{Parse one SIU director's-report HTML page into the 64-column schema} +\usage{ +.siu_parse_report(html, drid, url) +} +\arguments{ +\item{html}{The report page HTML.} + +\item{drid}{The director's-report id.} + +\item{url}{The source URL of the report page.} +} +\value{ +A named character vector with the 64 SIU dataset columns; +report-derived fields are populated, news fields left empty. +} +\description{ +Parse one SIU director's-report HTML page into the 64-column schema +} +\keyword{internal} diff --git a/r-package/morie/man/dwnmn.Rd b/r-package/morie/man/dwnmn.Rd index ded75ed6ba..53a9babeb4 100644 --- a/r-package/morie/man/dwnmn.Rd +++ b/r-package/morie/man/dwnmn.Rd @@ -2,16 +2,16 @@ % Please edit documentation in R/dwnmn.R \name{dwnmn} \alias{dwnmn} -\alias{dynamic_wnominate} +\alias{morie_dynamic_wnominate} \title{Dynamic ideal points / random-walk smoother (Armstrong Ch 6)} \usage{ dwnmn(x, sigma_w = 0.1) -dynamic_wnominate(x, sigma_w = 0.1) +morie_dynamic_wnominate(x, sigma_w = 0.1) } \arguments{ \item{x}{Numeric vector (per-period ideal points) or matrix -(n_legislators by T).} +(n_legislators by n_t).} \item{sigma_w}{Random-walk innovation SD.} } @@ -24,9 +24,6 @@ Kalman + RTS smoother applied to a per-period scalar ideal-point series (or a panel of legislators). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +dwnmn(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/e_value.Rd b/r-package/morie/man/e_value.Rd deleted file mode 100644 index 9398a57e10..0000000000 --- a/r-package/morie/man/e_value.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by morie generate_rd.py -\name{e_value} -\alias{e_value} -\title{Compute E-value for unmeasured confounding} -\description{ -Compute the E-value for unmeasured confounding (VanderWeele and Ding, 2017). - -The E-value quantifies the minimum strength of confounding association -needed to fully explain away an observed treatment effect. -For a risk ratio less than 1, use its reciprocal before applying the formula. -} -\usage{ -e_value(rr, rr_lower) -} -\arguments{ -\item{rr}{Risk ratio estimate (> 0). Supply > 1; if < 1, pass its reciprocal.} -\item{rr_lower}{Lower bound of the 95\% CI (used to compute E-value for CI).} -} -\value{ -Named list with components \code{e_value} and \code{e_value_ci}. -} -\examples{ -e_value(rr = 3.9, rr_lower = 2.4) -} diff --git a/r-package/morie/man/effective_sample_size.Rd b/r-package/morie/man/effective_sample_size.Rd deleted file mode 100644 index 3c2870c7c3..0000000000 --- a/r-package/morie/man/effective_sample_size.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by morie generate_rd.py -\name{effective_sample_size} -\alias{effective_sample_size} -\title{Kish effective sample size} -\description{ - Kish effective sample size -} -\usage{ - effective_sample_size(weights) -} -\arguments{ - \item{weights}{Numeric vector of sampling weights.} -} -\value{ - Numeric ESS. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/estimate_aipw.Rd b/r-package/morie/man/estimate_aipw.Rd deleted file mode 100644 index 590dbd9b7f..0000000000 --- a/r-package/morie/man/estimate_aipw.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by morie generate_rd.py -\name{estimate_aipw} -\alias{estimate_aipw} -\title{Augmented IPW (AIPW) doubly-robust ATE estimator} -\description{ - Augmented IPW (AIPW) doubly-robust ATE estimator. - - Combines IPW and outcome regression corrections. Consistent if - \strong{either} the propensity model \strong{or} the outcome model - is correctly specified. -} -\usage{ - estimate_aipw(data, treatment, outcome, covariates, - propensity_col = NULL, - outcome_model = c("linear", "logistic")) -} -\arguments{ - \item{data}{A data frame containing treatment, outcome, and covariate columns.} - \item{treatment}{Name of the binary treatment column (0/1).} - \item{outcome}{Name of the outcome column.} - \item{covariates}{Character vector of covariate column names.} - \item{propensity_col}{Optional name of a pre-computed propensity column. - If NULL, propensity is fit via logistic regression on \code{covariates}.} - \item{outcome_model}{Family for the outcome model: \code{"linear"} or \code{"logistic"}.} -} -\value{ - Named list: \code{ate}, \code{se}, \code{ci_lower}, \code{ci_upper}, \code{n}. -} -\examples{ -set.seed(2026) -n <- 200L -x <- rnorm(n) -D <- rbinom(n, 1, plogis(0.5 * x)) -y <- 0.7 * D + 0.3 * x + rnorm(n) -df <- data.frame(D = D, y = y, age = x) -res <- estimate_aipw(df, treatment = "D", outcome = "y", - covariates = "age") -res$ate -res$ci_lower; res$ci_upper -} diff --git a/r-package/morie/man/estimate_atc.Rd b/r-package/morie/man/estimate_atc.Rd deleted file mode 100644 index 145744a4fa..0000000000 --- a/r-package/morie/man/estimate_atc.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by morie generate_rd.py -\name{estimate_atc} -\alias{estimate_atc} -\title{Estimate the Average Treatment Effect on the Controls (ATC)} -\description{ - Estimate the Average Treatment Effect on the Controls (ATC) - -Control units receive weight 1; treated units receive -\eqn{w_i = (1-\hat{e}(X_i))/\hat{e}(X_i)}. -} -\value{ - Named list: 'atc', 'se', 'ci_lower', 'ci_upper', 'n_control'. -} -\examples{ -set.seed(2026) -n <- 200L -x <- rnorm(n) -D <- rbinom(n, 1, plogis(0.5 * x)) -y <- 0.7 * D + 0.3 * x + rnorm(n) -df <- data.frame(D = D, y = y, age = x) -res <- estimate_atc(df, treatment = "D", outcome = "y", - covariates = "age") -res$atc -} diff --git a/r-package/morie/man/estimate_ate.Rd b/r-package/morie/man/estimate_ate.Rd deleted file mode 100644 index c79e324b21..0000000000 --- a/r-package/morie/man/estimate_ate.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by morie generate_rd.py -\name{estimate_ate} -\alias{estimate_ate} -\title{Estimate the Average Treatment Effect (ATE) via Hajek IPW} -\description{ - Estimate the Average Treatment Effect (ATE) via Hajek IPW - -The Hajek estimator uses stabilised IPW weights: -\deqn{\widehat{ATE} = \bar{y}_1^{w} - \bar{y}_0^{w}} -where \eqn{\bar{y}_t^{w} = \sum_{T_i=t} w_i Y_i / \sum_{T_i=t} w_i} -and \eqn{w_i = T_i/\hat{e}(X_i) + (1-T_i)/(1-\hat{e}(X_i))}. -} -\usage{ - estimate_ate(data, treatment, outcome, covariates, propensity_col) -} -\arguments{ - \item{data}{A data frame.} - \item{treatment}{Name of the binary treatment column.} - \item{outcome}{Name of the outcome column.} - \item{covariates}{Character vector of covariate names.} - \item{propensity_col}{Optional: name of a pre-computed propensity score column.} -} -\value{ - Named list: 'ate', 'se', 'ci_lower', 'ci_upper', 'n', 'ess'. -} -\examples{ - set.seed(1) - df <- data.frame( - t = rbinom(200, 1, 0.4), - y = rnorm(200), - x = rnorm(200) - ) - estimate_ate(df, "t", "y", "x") -} diff --git a/r-package/morie/man/estimate_att.Rd b/r-package/morie/man/estimate_att.Rd deleted file mode 100644 index 39421b0b18..0000000000 --- a/r-package/morie/man/estimate_att.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by morie generate_rd.py -\name{estimate_att} -\alias{estimate_att} -\title{Estimate the Average Treatment Effect on the Treated (ATT)} -\description{ - Estimate the Average Treatment Effect on the Treated (ATT) - -Treated units receive weight 1; controls receive -\eqn{w_i = \hat{e}(X_i)/(1-\hat{e}(X_i))}. -} -\value{ - Named list: 'att', 'se', 'ci_lower', 'ci_upper', 'n_treated'. -} -\examples{ - set.seed(2) - df <- data.frame(t = rbinom(200, 1, 0.4), y = rnorm(200), x = rnorm(200)) - estimate_att(df, "t", "y", "x") -} diff --git a/r-package/morie/man/estimate_cate.Rd b/r-package/morie/man/estimate_cate.Rd deleted file mode 100644 index f2947fc922..0000000000 --- a/r-package/morie/man/estimate_cate.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by morie generate_rd.py -\name{estimate_cate} -\alias{estimate_cate} -\title{Estimate per-unit Conditional Average Treatment Effects (CATE)} -\description{ -Estimate per-unit Conditional Average Treatment Effects via either a -T-learner or an S-learner meta-learner. - -The \strong{T-learner} fits separate outcome models on treated and -control units, then predicts the counterfactual for each unit: -\eqn{\widehat{CATE}_i = \hat{\mu}_1(X_i) - \hat{\mu}_0(X_i)}. - -The \strong{S-learner} fits one model with treatment as a feature and -predicts \eqn{\hat{\mu}(X_i, 1) - \hat{\mu}(X_i, 0)} per unit. -} -\usage{ -estimate_cate(data, treatment, outcome, covariates, - propensity_col = NULL, - outcome_model = c("linear", "logistic"), - meta_learner = c("t_learner", "s_learner")) -} -\arguments{ -\item{data}{A data frame containing the treatment, outcome, and covariates.} -\item{treatment}{Name of the binary treatment column in \code{data}.} -\item{outcome}{Name of the outcome column in \code{data}.} -\item{covariates}{Character vector of covariate column names in \code{data}.} -\item{propensity_col}{Optional name of a pre-computed propensity-score -column. If \code{NULL}, propensities are estimated internally.} -\item{outcome_model}{Outcome-model family: \code{"linear"} (default) for -continuous outcomes or \code{"logistic"} for binary outcomes.} -\item{meta_learner}{Meta-learner: \code{"t_learner"} (default) or -\code{"s_learner"}.} -} -\value{ -A data frame with one row per unit in \code{data}, containing per-unit -CATE estimates and supporting columns. -} -\examples{ -\donttest{ -df <- data.frame( - y = rnorm(100), - z = sample(0:1, 100, replace = TRUE), - x1 = rnorm(100), x2 = rnorm(100) -) -estimate_cate(df, treatment = "z", outcome = "y", - covariates = c("x1", "x2")) -} -} diff --git a/r-package/morie/man/estimate_g_computation.Rd b/r-package/morie/man/estimate_g_computation.Rd deleted file mode 100644 index b076651fd7..0000000000 --- a/r-package/morie/man/estimate_g_computation.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by morie generate_rd.py -\name{estimate_g_computation} -\alias{estimate_g_computation} -\title{G-computation (outcome regression) ATE estimator} -\description{ - G-computation (outcome regression) ATE estimator - -Estimates the ATE by: -\deqn{\widehat{ATE} = \frac{1}{n}\sum_i \bigl[\hat{\mu}_1(X_i) - \hat{\mu}_0(X_i)\bigr]} -} -\value{ - Named list: 'ate', 'se', 'ci_lower', 'ci_upper'. -} -\examples{ -set.seed(2026) -n <- 200L -x <- rnorm(n) -D <- rbinom(n, 1, plogis(0.5 * x)) -y <- 0.7 * D + 0.3 * x + rnorm(n) -df <- data.frame(D = D, y = y, age = x) -res <- estimate_g_computation(df, treatment = "D", - outcome = "y", - covariates = "age") -res$ate -} diff --git a/r-package/morie/man/estimate_gate.Rd b/r-package/morie/man/estimate_gate.Rd deleted file mode 100644 index 01b7983b2e..0000000000 --- a/r-package/morie/man/estimate_gate.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by morie generate_rd.py -\name{estimate_gate} -\alias{estimate_gate} -\title{Estimate Group Average Treatment Effects (GATE)} -\description{ -Estimate Group Average Treatment Effects by applying AIPW within each -level of \code{group_col} to obtain stratum-specific treatment-effect -estimates. -} -\usage{ -estimate_gate(data, treatment, outcome, covariates, group_col, - propensity_col = NULL, - outcome_model = c("linear", "logistic")) -} -\arguments{ -\item{data}{A data frame containing the treatment, outcome, covariates, -and grouping column.} -\item{treatment}{Name of the binary treatment column in \code{data}.} -\item{outcome}{Name of the outcome column in \code{data}.} -\item{covariates}{Character vector of covariate column names.} -\item{group_col}{Name of the grouping variable -(e.g. \code{"gender"}, \code{"region"}).} -\item{propensity_col}{Optional name of a pre-computed propensity-score -column. If \code{NULL}, propensities are estimated internally.} -\item{outcome_model}{Outcome-model family: \code{"linear"} (default) for -continuous outcomes or \code{"logistic"} for binary outcomes.} -} -\value{ -A data frame with one row per group level, containing the columns -\code{group}, \code{ate}, \code{se}, \code{ci_lower}, \code{ci_upper}, \code{n}. -} -\examples{ -\donttest{ -set.seed(3) -df <- data.frame( - t = rbinom(300, 1, 0.4), - y = rnorm(300), - x = rnorm(300), - g = sample(c("A", "B"), 300, replace = TRUE) -) -estimate_gate(df, treatment = "t", outcome = "y", - covariates = "x", group_col = "g") -} -} diff --git a/r-package/morie/man/estimate_late.Rd b/r-package/morie/man/estimate_late.Rd deleted file mode 100644 index 96952d1237..0000000000 --- a/r-package/morie/man/estimate_late.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by morie generate_rd.py -\name{estimate_late} -\alias{estimate_late} -\title{Local Average Treatment Effect (LATE) via 2SLS / Wald} -\description{ - Estimate the Local Average Treatment Effect (LATE) via 2SLS / Wald - -Uses a binary instrument \eqn{Z} to identify the LATE (Imbens & Angrist, 1994): -\deqn{LATE = \frac{Cov(Y, Z)}{Cov(T, Z)}} - -With covariates, uses two-stage OLS (Wald within residuals). -Requires 'ivreg::ivreg()' if available; otherwise falls back to the -closed-form Wald estimator. -} -\usage{ - estimate_late(data, treatment, outcome, instrument, covariates) -} -\arguments{ - \item{data}{A data frame.} - \item{treatment}{Name of the binary endogenous treatment column.} - \item{outcome}{Name of the outcome column.} - \item{instrument}{Name of the binary instrument column.} - \item{covariates}{Optional character vector of exogenous covariates.} -} -\value{ - Named list: 'late', 'se', 'ci_lower', 'ci_upper', -} -\examples{ -set.seed(2026) -n <- 300L -z <- rbinom(n, 1, 0.5) -# Compliance: treatment status depends on instrument z + covariate x -x <- rnorm(n) -D <- rbinom(n, 1, plogis(0.5 + z * 1.5 + 0.3 * x)) -y <- 0.7 * D + 0.3 * x + rnorm(n) -df <- data.frame(D = D, y = y, age = x, z = z) -res <- estimate_late(df, treatment = "D", outcome = "y", - instrument = "z", covariates = "age") -res$late -} diff --git a/r-package/morie/man/estimate_propensity_scores.Rd b/r-package/morie/man/estimate_propensity_scores.Rd deleted file mode 100644 index a5a425370e..0000000000 --- a/r-package/morie/man/estimate_propensity_scores.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{estimate_propensity_scores} -\alias{estimate_propensity_scores} -\title{Estimate propensity scores via logistic regression} -\description{ - Estimate propensity scores via logistic regression -} -\usage{ - estimate_propensity_scores(data, treatment, covariates, trim) -} -\arguments{ - \item{data}{A data frame.} - \item{treatment}{Name of the binary treatment column.} - \item{covariates}{Character vector of covariate names.} - \item{trim}{Quantile pair used to winsorize extreme scores (default 0.01, 0.99).} -} -\value{ - Numeric vector of propensity scores (same length as 'nrow(data)'). -} -\examples{ - df <- data.frame(t = c(0,1,0,1,0,1), x = rnorm(6)) - ps <- estimate_propensity_scores(df, "t", "x") -} diff --git a/r-package/morie/man/eta_squared.Rd b/r-package/morie/man/eta_squared.Rd deleted file mode 100644 index dafeff1036..0000000000 --- a/r-package/morie/man/eta_squared.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by morie generate_rd.py -\name{eta_squared} -\alias{eta_squared} -\title{Eta-squared from F-statistic} -\description{ - Eta-squared from F-statistic -} -\usage{ - eta_squared(f_stat, df_between, df_within) -} -\arguments{ - \item{f_stat}{F statistic.} - \item{df_between}{Degrees of freedom (numerator).} - \item{df_within}{Degrees of freedom (denominator).} -} -\value{ - Numeric eta-squared. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/extvm.Rd b/r-package/morie/man/extvm.Rd index 3b1a13016c..9c5700b68b 100644 --- a/r-package/morie/man/extvm.Rd +++ b/r-package/morie/man/extvm.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/extvm.R \name{extvm} \alias{extvm} -\alias{extreme_value_gev} +\alias{morie_extreme_value_gev} \title{Generalised Extreme Value fit by ML (Coles 2001)} \usage{ extvm(x) -extreme_value_gev(x) +morie_extreme_value_gev(x) } \arguments{ \item{x}{numeric vector of block maxima.} diff --git a/r-package/morie/man/find_project_root.Rd b/r-package/morie/man/find_project_root.Rd deleted file mode 100644 index 7cf204edb7..0000000000 --- a/r-package/morie/man/find_project_root.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by morie generate_rd.py -\name{find_project_root} -\alias{find_project_root} -\title{Find a project root directory} -\description{ - Find a project root directory - -Searches upward from 'start' for a directory containing the current -Sphinx/package-root markers, while still tolerating legacy Quarto-era -markers in older checkouts. -} -\usage{ - find_project_root(start, max_up) -} -\arguments{ - \item{start}{Starting directory.} - \item{max_up}{Maximum number of parent traversals.} -} -\value{ - Absolute path to the detected project root. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/fisher_exact_test.Rd b/r-package/morie/man/fisher_exact_test.Rd deleted file mode 100644 index 07032fd164..0000000000 --- a/r-package/morie/man/fisher_exact_test.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{fisher_exact_test} -\alias{fisher_exact_test} -\title{Fisher's exact test for 2x2 tables} -\description{ - Fisher's exact test for 2x2 tables -} -\usage{ - fisher_exact_test(table_2x2, alternative) -} -\arguments{ - \item{table_2x2}{A 2x2 matrix or data frame of counts.} - \item{alternative}{'"two.sided"', '"greater"', or '"less"'.} -} -\value{ - Named list: 'odds_ratio', 'ci', 'p_value'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/frns_metrics.Rd b/r-package/morie/man/frns_metrics.Rd index ef99a3637e..307bddc7de 100644 --- a/r-package/morie/man/frns_metrics.Rd +++ b/r-package/morie/man/frns_metrics.Rd @@ -3,6 +3,11 @@ \name{frns_metrics} \alias{frns_metrics} \title{Group-disparity metrics for auditing classification and risk systems} +\value{ +Each callable in this module returns a named \code{list} with the +metric \code{value}, a per-group breakdown, advisory \code{warnings}, and +a plain-language \code{interpretation}. +} \description{ R parity for the Python \code{morie.fairness.metrics} module. Every callable here is an \emph{audit} measure: given the decisions a system @@ -14,12 +19,12 @@ only measure disparity in predictions that already exist. \details{ Functions: \itemize{ -\item \code{fairness_disparate_impact()}: the EEOC four-fifths rule. -\item \code{fairness_demographic_parity()}: favourable-rate gap. -\item \code{fairness_equalized_odds()}: TPR/FPR gaps (needs ground truth). -\item \code{fairness_average_odds_difference()}: mean TPR+FPR gap. -\item \code{fairness_gini()}: concentration of a score distribution. -\item \code{fairness_bias_amplification()}: composite \code{Delta_parity * G}. +\item \code{morie_fairness_disparate_impact()}: the EEOC four-fifths rule. +\item \code{morie_fairness_demographic_parity()}: favourable-rate gap. +\item \code{morie_fairness_equalized_odds()}: TPR/FPR gaps (needs ground truth). +\item \code{morie_fairness_average_odds_difference()}: mean TPR+FPR gap. +\item \code{morie_fairness_gini()}: concentration of a score distribution. +\item \code{morie_fairness_bias_amplification()}: composite \code{Delta_parity * G}. } Each returns a named \code{list} with the metric value, a per-group @@ -33,3 +38,8 @@ Fairness 360 metric definitions; the COMPAS audit in pbiecek's (Lacherade, Szabo, Krikava & Aeby, 2021); and Barman & Barman, arXiv:2603.18987 (the Bias Amplification Score). } +\examples{ +pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0) +race <- c(rep("A", 5), rep("B", 5)) +morie_fairness_disparate_impact(pred, race, privileged = "A")$value +} diff --git a/r-package/morie/man/frns_predpol.Rd b/r-package/morie/man/frns_predpol.Rd index 0c25ce07e4..ca340c32d8 100644 --- a/r-package/morie/man/frns_predpol.Rd +++ b/r-package/morie/man/frns_predpol.Rd @@ -3,6 +3,13 @@ \name{frns_predpol} \alias{frns_predpol} \title{Generalised predictive-policing disparity audit} +\value{ +\code{morie_predpol_aggregate_areas()} returns a per-area +\code{data.frame}; \code{morie_predpol_calibration_audit()} and +\code{morie_predpol_score_disparity()} return named \code{list}s of audit +statistics, per-group breakdowns, and a plain-language +\code{interpretation}. +} \description{ R parity for the Python \code{morie.fairness.predpol} module. A clean-room, city-agnostic reimplementation of the district-level @@ -15,14 +22,21 @@ composition. \details{ Functions: \itemize{ -\item \code{predpol_aggregate_areas()}: roll per-record data up to one row +\item \code{morie_predpol_aggregate_areas()}: roll per-record data up to one row per area. -\item \code{predpol_calibration_audit()}: Spearman calibration plus a +\item \code{morie_predpol_calibration_audit()}: Spearman calibration plus a per-group mean rank gap (the over-/under-prediction signal). -\item \code{predpol_score_disparity()}: descriptive per-group risk-score +\item \code{morie_predpol_score_disparity()}: descriptive per-group risk-score summary with a one-way ANOVA. } Written from the project's published methodology; no code copied (that repository carries no licence and is not redistributable). } +\examples{ +agg <- morie_predpol_aggregate_areas( + area = c("a", "a", "b", "b"), risk = c(10, 20, 30, 40), + outcome = c(1, 0, 1, 1) +) +agg$mean_risk +} diff --git a/r-package/morie/man/frns_temporal.Rd b/r-package/morie/man/frns_temporal.Rd index baefce9fb8..a4b46747e0 100644 --- a/r-package/morie/man/frns_temporal.Rd +++ b/r-package/morie/man/frns_temporal.Rd @@ -3,6 +3,11 @@ \name{frns_temporal} \alias{frns_temporal} \title{Multi-city temporal disparity audit} +\value{ +The module's audit callable returns a named \code{list} with the +worst per-city Disparate Impact Ratio range, per-city and per-cell +breakdowns, and a plain-language \code{interpretation}. +} \description{ R parity for the Python \code{morie.fairness.temporal} module. The four disparity metrics — Disparate Impact Ratio, Demographic Parity Gap, @@ -16,3 +21,10 @@ arXiv:2603.18987. Its central lesson: bias metrics are not stable from one deployment cycle to the next and must be recomputed per period and per city. } +\examples{ +period <- c(rep("p1", 10), rep("p2", 10)) +city <- rep("A", 20) +pred <- rep(c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0), 2) +grp <- rep(c(rep("X", 5), rep("Y", 5)), 2) +morie_predpol_temporal_audit(period, city, pred, grp, privileged = "X") +} diff --git a/r-package/morie/man/fzbrd.Rd b/r-package/morie/man/fzbrd.Rd index 01c12e9e40..6bbd176948 100644 --- a/r-package/morie/man/fzbrd.Rd +++ b/r-package/morie/man/fzbrd.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzbrd.R \name{fzbrd} \alias{fzbrd} -\alias{fauzi_bias_reduced_kdfe} +\alias{morie_fauzi_bias_reduced_kdfe} \title{Fauzi: Bias-reduced KDFE via geometric extrapolation (Ch 2)} \usage{ fzbrd(x, t = NULL, h = NULL, c = 2) -fauzi_bias_reduced_kdfe(x, t = NULL, h = NULL, c = 2) +morie_fauzi_bias_reduced_kdfe(x, t = NULL, h = NULL, c = 2) } \arguments{ \item{x}{Numeric vector.} @@ -26,9 +26,6 @@ Richardson-style extrapolation cancels the O(h^2) bias of the KDFE: \deqn{\hat F_{br}(t) = (c^2 \hat F_h(t) - \hat F_{ch}(t))/(c^2-1).} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzbrd(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzcvm.Rd b/r-package/morie/man/fzcvm.Rd index 33afbce4e0..7469105355 100644 --- a/r-package/morie/man/fzcvm.Rd +++ b/r-package/morie/man/fzcvm.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzcvm.R \name{fzcvm} \alias{fzcvm} -\alias{fauzi_cvm_smoothed} +\alias{morie_fauzi_cvm_smoothed} \title{Fauzi: Cramer-von Mises test with kernel-smoothed CDF (Ch 5)} \usage{ fzcvm(x, cdf = "norm", args = NULL, h = NULL) -fauzi_cvm_smoothed(x, cdf = "norm", args = NULL, h = NULL) +morie_fauzi_cvm_smoothed(x, cdf = "norm", args = NULL, h = NULL) } \arguments{ \item{x}{Numeric vector.} @@ -25,9 +25,6 @@ Named list: statistic, p_value, h, n, method. \eqn{W_n^2 = n \int (\hat F_h(t)-F_0(t))^2 dF_0(t)}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzcvm(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzedg.Rd b/r-package/morie/man/fzedg.Rd index eb95318219..420abf9ee2 100644 --- a/r-package/morie/man/fzedg.Rd +++ b/r-package/morie/man/fzedg.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzedg.R \name{fzedg} \alias{fzedg} -\alias{fauzi_edgeworth_quantile} +\alias{morie_fauzi_edgeworth_quantile} \title{Fauzi: Edgeworth expansion for kernel quantile (Ch 3)} \usage{ fzedg(x, z = 1.96, p = 0.5) -fauzi_edgeworth_quantile(x, z = 1.96, p = 0.5) +morie_fauzi_edgeworth_quantile(x, z = 1.96, p = 0.5) } \arguments{ \item{x}{Numeric vector (only n is used; result is asymptotic).} @@ -27,9 +27,6 @@ studentised kernel quantile. Skewness of the indicator score \deqn{P(T_n \le z) \approx \Phi(z) - (\gamma_1/6)(z^2-1)\phi(z)/\sqrt n.} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzedg(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzhdc.Rd b/r-package/morie/man/fzhdc.Rd index f93bea8423..a25a84df95 100644 --- a/r-package/morie/man/fzhdc.Rd +++ b/r-package/morie/man/fzhdc.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzhdc.R \name{fzhdc} \alias{fzhdc} -\alias{fauzi_h_decomposition} +\alias{morie_fauzi_h_decomposition} \title{Fauzi: Hoeffding (H-) decomposition of a degree-2 U-statistic (Ch 5)} \usage{ fzhdc(x, kernel = NULL, max_pairs = 2000L, seed = 0L) -fauzi_h_decomposition(x, kernel = NULL, max_pairs = 2000L, seed = 0L) +morie_fauzi_h_decomposition(x, kernel = NULL, max_pairs = 2000L, seed = 0L) } \arguments{ \item{x}{Numeric vector.} @@ -28,9 +28,6 @@ For a symmetric kernel g(x1,x2), \eqn{\mathrm{Var}(U_n) \approx 4\sigma_1^2/n}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzhdc(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzhok.Rd b/r-package/morie/man/fzhok.Rd index 5080331109..36e513d554 100644 --- a/r-package/morie/man/fzhok.Rd +++ b/r-package/morie/man/fzhok.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzhok.R \name{fzhok} \alias{fzhok} -\alias{fauzi_higher_order_kernel} +\alias{morie_fauzi_higher_order_kernel} \title{Fauzi: Higher-order (order-4) Gaussian-based kernel (Ch 1)} \usage{ fzhok(x, t = NULL, h = NULL, order = 4L) -fauzi_higher_order_kernel(x, t = NULL, h = NULL, order = 4L) +morie_fauzi_higher_order_kernel(x, t = NULL, h = NULL, order = 4L) } \arguments{ \item{x}{Numeric vector.} @@ -27,9 +27,6 @@ KDE with order-4 Wand-Jones (1995, eq 2.8) kernel to O(h^4). Note: K_4 takes negative values so f_hat may be < 0. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzhok(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzkdf.Rd b/r-package/morie/man/fzkdf.Rd index ba3484bf6d..208cf63cd2 100644 --- a/r-package/morie/man/fzkdf.Rd +++ b/r-package/morie/man/fzkdf.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzkdf.R \name{fzkdf} \alias{fzkdf} -\alias{fauzi_kdfe_properties} +\alias{morie_fauzi_kdfe_properties} \title{Fauzi: KDFE bias and variance properties (Ch 2)} \usage{ fzkdf(x, t = NULL, h = NULL) -fauzi_kdfe_properties(x, t = NULL, h = NULL) +morie_fauzi_kdfe_properties(x, t = NULL, h = NULL) } \arguments{ \item{x}{Numeric vector.} @@ -28,9 +28,6 @@ bias and variance: For the Gaussian kernel \eqn{\mu_2=1, r(K)=1/(2\sqrt\pi)}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzkdf(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzksm.Rd b/r-package/morie/man/fzksm.Rd index b10af3816d..7ea33cbb2a 100644 --- a/r-package/morie/man/fzksm.Rd +++ b/r-package/morie/man/fzksm.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzksm.R \name{fzksm} \alias{fzksm} -\alias{fauzi_ks_smoothed} +\alias{morie_fauzi_ks_smoothed} \title{Fauzi: Kolmogorov-Smirnov test with kernel-smoothed CDF (Ch 5)} \usage{ fzksm(x, cdf = "norm", args = NULL, h = NULL, n_grid = 512L) -fauzi_ks_smoothed(x, cdf = "norm", args = NULL, h = NULL, n_grid = 512L) +morie_fauzi_ks_smoothed(x, cdf = "norm", args = NULL, h = NULL, n_grid = 512L) } \arguments{ \item{x}{Numeric vector.} @@ -28,9 +28,6 @@ Named list: statistic, p_value, h, n, method. \eqn{D_n = \sup_t |\hat F_h(t) - F_0(t)|}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzksm(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzlst.Rd b/r-package/morie/man/fzlst.Rd index 6fa26de58b..55c0937a97 100644 --- a/r-package/morie/man/fzlst.Rd +++ b/r-package/morie/man/fzlst.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzlst.R \name{fzlst} \alias{fzlst} -\alias{fauzi_l_statistic} +\alias{morie_fauzi_l_statistic} \title{Fauzi: L-statistic for kernel functionals (Ch 5)} \usage{ fzlst(x, score = NULL, n_quad = 200L) -fauzi_l_statistic(x, score = NULL, n_quad = 200L) +morie_fauzi_l_statistic(x, score = NULL, n_quad = 200L) } \arguments{ \item{x}{Numeric vector.} @@ -25,9 +25,6 @@ Named list with estimate, se, n, method. Default J(u)=1 gives the sample mean. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzlst(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzmis.Rd b/r-package/morie/man/fzmis.Rd index bd7c71d822..7a7dc3d5c7 100644 --- a/r-package/morie/man/fzmis.Rd +++ b/r-package/morie/man/fzmis.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzmis.R \name{fzmis} \alias{fzmis} -\alias{fauzi_mise_computation} +\alias{morie_fauzi_mise_computation} \title{Fauzi: MISE decomposition for Gaussian KDE (Ch 1)} \usage{ fzmis(x, h = NULL) -fauzi_mise_computation(x, h = NULL) +morie_fauzi_mise_computation(x, h = NULL) } \arguments{ \item{x}{Numeric vector.} @@ -24,9 +24,6 @@ With normal-reference plug-in \eqn{R(f'')=3/(8\sqrt\pi \sigma^5)}. Returns h_opt minimising MISE. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzmis(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzmrb.Rd b/r-package/morie/man/fzmrb.Rd index 17c6fb2d6a..d495cd2d2a 100644 --- a/r-package/morie/man/fzmrb.Rd +++ b/r-package/morie/man/fzmrb.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzmrb.R \name{fzmrb} \alias{fzmrb} -\alias{fauzi_mrl_boundary_free} +\alias{morie_fauzi_mrl_boundary_free} \title{Fauzi: Boundary-free MRL via log-bijection (Ch 4)} \usage{ fzmrb(x, t = NULL, h = NULL) -fauzi_mrl_boundary_free(x, t = NULL, h = NULL) +morie_fauzi_mrl_boundary_free(x, t = NULL, h = NULL) } \arguments{ \item{x}{Numeric vector, strictly positive.} @@ -24,9 +24,7 @@ Avoids boundary bias of the kernel MRL near t=0 by smoothing on the log-transformed scale. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \keyword{internal} diff --git a/r-package/morie/man/fzmrl.Rd b/r-package/morie/man/fzmrl.Rd index f6aa4d477f..d9da2aefd9 100644 --- a/r-package/morie/man/fzmrl.Rd +++ b/r-package/morie/man/fzmrl.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzmrl.R \name{fzmrl} \alias{fzmrl} -\alias{fauzi_mrl_asymptotic} +\alias{morie_fauzi_mrl_asymptotic} \title{Fauzi: Kernel mean residual life asymptotics (Ch 4)} \usage{ fzmrl(x, t = NULL, h = NULL) -fauzi_mrl_asymptotic(x, t = NULL, h = NULL) +morie_fauzi_mrl_asymptotic(x, t = NULL, h = NULL) } \arguments{ \item{x}{Numeric vector (lifetimes).} @@ -24,9 +24,6 @@ Kernel-smoothed MRL \eqn{m(t)=E[X-t|X>t]} with Yang (1978) asymptotic SE. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzmrl(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzqnt.Rd b/r-package/morie/man/fzqnt.Rd index 577ad4fc79..480f37fb12 100644 --- a/r-package/morie/man/fzqnt.Rd +++ b/r-package/morie/man/fzqnt.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzqnt.R \name{fzqnt} \alias{fzqnt} -\alias{fauzi_kernel_quantile_asymptotic} +\alias{morie_fauzi_kernel_quantile_asymptotic} \title{Fauzi: Asymptotic distribution of kernel quantile (Ch 3)} \usage{ fzqnt(x, p = 0.5, h = NULL) -fauzi_kernel_quantile_asymptotic(x, p = 0.5, h = NULL) +morie_fauzi_kernel_quantile_asymptotic(x, p = 0.5, h = NULL) } \arguments{ \item{x}{Numeric vector.} @@ -23,9 +23,6 @@ Named list with estimate, se, p, h, density_at_Q, n, method. \eqn{\sqrt n(\hat Q-Q) \to N(0,p(1-p)/f(Q)^2)}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzqnt(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzsgn.Rd b/r-package/morie/man/fzsgn.Rd index 59a81c7973..8be54796dd 100644 --- a/r-package/morie/man/fzsgn.Rd +++ b/r-package/morie/man/fzsgn.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzsgn.R \name{fzsgn} \alias{fzsgn} -\alias{fauzi_smoothed_sign} +\alias{morie_fauzi_smoothed_sign} \title{Fauzi: Smoothed sign test (Ch 5)} \usage{ fzsgn(x, theta0 = 0, h = NULL, alternative = "two-sided") -fauzi_smoothed_sign(x, theta0 = 0, h = NULL, alternative = "two-sided") +morie_fauzi_smoothed_sign(x, theta0 = 0, h = NULL, alternative = "two-sided") } \arguments{ \item{x}{Numeric vector.} @@ -26,9 +26,6 @@ Named list with statistic, z, p_value, theta0, h, n, method. z = (S_n - n/2)/sqrt(n/4) ~ N(0,1). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzsgn(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzsrv.Rd b/r-package/morie/man/fzsrv.Rd index 6fda71a3c1..fb0525d6b4 100644 --- a/r-package/morie/man/fzsrv.Rd +++ b/r-package/morie/man/fzsrv.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/fzsrv.R \name{fzsrv} \alias{fzsrv} -\alias{fauzi_survival_kernel} +\alias{morie_fauzi_survival_kernel} \title{Fauzi: Kernel survival function estimator (Ch 4)} \usage{ fzsrv(x, t = NULL, h = NULL) -fauzi_survival_kernel(x, t = NULL, h = NULL) +morie_fauzi_survival_kernel(x, t = NULL, h = NULL) } \arguments{ \item{x}{Numeric vector (lifetimes).} @@ -23,9 +23,6 @@ Named list with estimate, se, ci_lower, ci_upper, t, h, n, method. \eqn{\hat S_h(t)=1-\hat F_h(t)} with asymptotic 95\\% CI. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzsrv(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/fzwlc.Rd b/r-package/morie/man/fzwlc.Rd index 212bd20f51..bf0781835b 100644 --- a/r-package/morie/man/fzwlc.Rd +++ b/r-package/morie/man/fzwlc.Rd @@ -2,12 +2,17 @@ % Please edit documentation in R/fzwlc.R \name{fzwlc} \alias{fzwlc} -\alias{fauzi_smoothed_wilcoxon} +\alias{morie_fauzi_smoothed_wilcoxon} \title{Fauzi: Smoothed Wilcoxon signed-rank test (Ch 5)} \usage{ fzwlc(x, theta0 = 0, h = NULL, alternative = "two-sided") -fauzi_smoothed_wilcoxon(x, theta0 = 0, h = NULL, alternative = "two-sided") +morie_fauzi_smoothed_wilcoxon( + x, + theta0 = 0, + h = NULL, + alternative = "two-sided" +) } \arguments{ \item{x}{Numeric vector.} @@ -26,9 +31,6 @@ Named list with statistic, z, p_value, theta0, h, n, method. z = W_n / sqrt(n(n+1)(2n+1)/6) ~ N(0,1). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +fzwlc(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/generate_synthetic_data.Rd b/r-package/morie/man/generate_synthetic_data.Rd deleted file mode 100644 index d765f05570..0000000000 --- a/r-package/morie/man/generate_synthetic_data.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by morie generate_rd.py -\name{generate_synthetic_data} -\alias{generate_synthetic_data} -\title{Generate synthetic epidemiology-style data} -\description{ - Generate synthetic epidemiology-style tabular data - -Generates non-identifying synthetic data suitable for development, testing, -and demos. The generator uses a canonical variable set and allows output -column renaming through 'name_map' so it can be adapted to multiple studies. -Synthetic data should not be used for final inferential reporting. -} -\usage{ - generate_synthetic_data(n, seed, special_code_rate, profile, name_map) -} -\arguments{ - \item{n}{Number of rows.} - \item{seed}{Random seed for reproducibility.} - \item{special_code_rate}{Proportion of values replaced with survey-style} - \item{profile}{Convenience profile for output naming; ignored when} - \item{name_map}{Optional named character vector mapping canonical keys to} -} -\value{ - A data.frame with synthetic records. -} -\examples{ -df <- generate_synthetic_data(n = 500, seed = 2026) -nrow(df); dim(df) -head(df[, 1:5]) -} diff --git a/r-package/morie/man/gpfit.Rd b/r-package/morie/man/gpfit.Rd index dde7da6fdd..0d52b1d6dc 100644 --- a/r-package/morie/man/gpfit.Rd +++ b/r-package/morie/man/gpfit.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/gpfit.R \name{gpfit} \alias{gpfit} -\alias{generalized_pareto} +\alias{morie_generalized_pareto} \title{Generalised Pareto fit (POT) by ML (Pickands 1975)} \usage{ gpfit(x, threshold = NULL) -generalized_pareto(x, threshold = NULL) +morie_generalized_pareto(x, threshold = NULL) } \arguments{ \item{x}{numeric vector of raw observations.} diff --git a/r-package/morie/man/gwreg.Rd b/r-package/morie/man/gwreg.Rd index 07cad9b09b..25e93352cf 100644 --- a/r-package/morie/man/gwreg.Rd +++ b/r-package/morie/man/gwreg.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/gwreg.R \name{gwreg} \alias{gwreg} -\alias{geographically_weighted_regression} +\alias{morie_geographically_weighted_regression} \title{Geographically weighted regression (GWR).} \usage{ gwreg(x, y, coords, bandwidth = NULL, kernel = "gaussian") -geographically_weighted_regression( +morie_geographically_weighted_regression( x, y, coords, @@ -35,10 +35,7 @@ Local WLS at each site i: \deqn{\beta(s_i) = (X^\top W(s_i) X)^{-1} X^\top W(s_i) y}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +gwreg(x = rnorm(50), y = rnorm(50), coords = matrix(runif(100), 50, 2)) } \references{ Brunsdon, Fotheringham & Charlton (1996). diff --git a/r-package/morie/man/hedges_g.Rd b/r-package/morie/man/hedges_g.Rd deleted file mode 100644 index 3c7e2cc213..0000000000 --- a/r-package/morie/man/hedges_g.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by morie generate_rd.py -\name{hedges_g} -\alias{hedges_g} -\title{Hedges' g (bias-corrected Cohen's d)} -\description{ - Hedges' g (bias-corrected Cohen's d) -} -\value{ - Numeric Hedges' g. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/hrzb1.Rd b/r-package/morie/man/hrzb1.Rd index 24f043fd80..b63d1bac16 100644 --- a/r-package/morie/man/hrzb1.Rd +++ b/r-package/morie/man/hrzb1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzb1.R \name{hrzb1} \alias{hrzb1} -\alias{horowitz_binary_response} +\alias{morie_horowitz_binary_response} \title{Manski (1975) maximum-score estimator} \usage{ hrzb1(x, y) -horowitz_binary_response(x, y) +morie_horowitz_binary_response(x, y) } \arguments{ \item{x}{Numeric covariate vector or design matrix.} diff --git a/r-package/morie/man/hrzb2.Rd b/r-package/morie/man/hrzb2.Rd index 0d6c3f2e92..5aa84367ad 100644 --- a/r-package/morie/man/hrzb2.Rd +++ b/r-package/morie/man/hrzb2.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzb2.R \name{hrzb2} \alias{hrzb2} -\alias{horowitz_smoothed_maximum_score} +\alias{morie_horowitz_smoothed_maximum_score} \title{Horowitz (1992) smoothed maximum-score estimator} \usage{ hrzb2(x, y, bandwidth = NULL) -horowitz_smoothed_maximum_score(x, y, bandwidth = NULL) +morie_horowitz_smoothed_maximum_score(x, y, bandwidth = NULL) } \arguments{ \item{x}{Numeric covariate vector or design matrix.} diff --git a/r-package/morie/man/hrzc1.Rd b/r-package/morie/man/hrzc1.Rd index 9e5ac5d6b0..698b767dda 100644 --- a/r-package/morie/man/hrzc1.Rd +++ b/r-package/morie/man/hrzc1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzc1.R \name{hrzc1} \alias{hrzc1} -\alias{horowitz_censored_regression} +\alias{morie_horowitz_censored_regression} \title{Powell (1984) censored LAD (CLAD)} \usage{ hrzc1(x, y, censor = 0) -horowitz_censored_regression(x, y, censor = 0) +morie_horowitz_censored_regression(x, y, censor = 0) } \arguments{ \item{x}{Numeric covariate vector or design matrix.} diff --git a/r-package/morie/man/hrzd1.Rd b/r-package/morie/man/hrzd1.Rd index 1140be9823..d53a365244 100644 --- a/r-package/morie/man/hrzd1.Rd +++ b/r-package/morie/man/hrzd1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzd1.R \name{hrzd1} \alias{hrzd1} -\alias{horowitz_duration_model} +\alias{morie_horowitz_duration_model} \title{Cox partial-likelihood proportional-hazards estimator} \usage{ hrzd1(t, x, event) -horowitz_duration_model(t, x, event) +morie_horowitz_duration_model(t, x, event) } \arguments{ \item{t}{Numeric vector of observed event/censoring times.} diff --git a/r-package/morie/man/hrzi1.Rd b/r-package/morie/man/hrzi1.Rd index dec30275c9..710043f712 100644 --- a/r-package/morie/man/hrzi1.Rd +++ b/r-package/morie/man/hrzi1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzi1.R \name{hrzi1} \alias{hrzi1} -\alias{horowitz_index_model} +\alias{morie_horowitz_index_model} \title{Ichimura (1993) single-index model} \usage{ hrzi1(x, y, bandwidth = NULL) -horowitz_index_model(x, y, bandwidth = NULL) +morie_horowitz_index_model(x, y, bandwidth = NULL) } \arguments{ \item{x}{Numeric covariate vector or design matrix.} diff --git a/r-package/morie/man/hrzi2.Rd b/r-package/morie/man/hrzi2.Rd index ddf48b045b..e90eb96d4e 100644 --- a/r-package/morie/man/hrzi2.Rd +++ b/r-package/morie/man/hrzi2.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzi2.R \name{hrzi2} \alias{hrzi2} -\alias{horowitz_average_derivative} +\alias{morie_horowitz_average_derivative} \title{Density-weighted average derivative} \usage{ hrzi2(x, y, bandwidth = NULL) -horowitz_average_derivative(x, y, bandwidth = NULL) +morie_horowitz_average_derivative(x, y, bandwidth = NULL) } \arguments{ \item{x}{Numeric covariate vector or design matrix.} diff --git a/r-package/morie/man/hrzk1.Rd b/r-package/morie/man/hrzk1.Rd index 2ecd71bae1..620e20cd56 100644 --- a/r-package/morie/man/hrzk1.Rd +++ b/r-package/morie/man/hrzk1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzk1.R \name{hrzk1} \alias{hrzk1} -\alias{horowitz_kernel_density} +\alias{morie_horowitz_kernel_density} \title{Kernel density estimator (Rosenblatt-Parzen)} \usage{ hrzk1(x, bandwidth = NULL, sample = NULL) -horowitz_kernel_density(x, bandwidth = NULL, sample = NULL) +morie_horowitz_kernel_density(x, bandwidth = NULL, sample = NULL) } \arguments{ \item{x}{Numeric evaluation points (or sample if \code{sample} is NULL).} diff --git a/r-package/morie/man/hrzk2.Rd b/r-package/morie/man/hrzk2.Rd index eba2ce9a85..6426cb850d 100644 --- a/r-package/morie/man/hrzk2.Rd +++ b/r-package/morie/man/hrzk2.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzk2.R \name{hrzk2} \alias{hrzk2} -\alias{horowitz_kernel_regression} +\alias{morie_horowitz_kernel_regression} \title{Nadaraya-Watson kernel regression} \usage{ hrzk2(x, y, bandwidth = NULL, grid = NULL) -horowitz_kernel_regression(x, y, bandwidth = NULL, grid = NULL) +morie_horowitz_kernel_regression(x, y, bandwidth = NULL, grid = NULL) } \arguments{ \item{x}{Numeric covariate vector.} diff --git a/r-package/morie/man/hrzk3.Rd b/r-package/morie/man/hrzk3.Rd index 7995bdcd29..06bca3dff6 100644 --- a/r-package/morie/man/hrzk3.Rd +++ b/r-package/morie/man/hrzk3.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzk3.R \name{hrzk3} \alias{hrzk3} -\alias{horowitz_local_linear} +\alias{morie_horowitz_local_linear} \title{Local-linear regression estimator} \usage{ hrzk3(x, y, bandwidth = NULL, grid = NULL) -horowitz_local_linear(x, y, bandwidth = NULL, grid = NULL) +morie_horowitz_local_linear(x, y, bandwidth = NULL, grid = NULL) } \arguments{ \item{x}{Numeric covariate vector.} diff --git a/r-package/morie/man/hrzm1.Rd b/r-package/morie/man/hrzm1.Rd index 9ba9203754..370a5d4a9b 100644 --- a/r-package/morie/man/hrzm1.Rd +++ b/r-package/morie/man/hrzm1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzm1.R \name{hrzm1} \alias{hrzm1} -\alias{horowitz_mixture_model} +\alias{morie_horowitz_mixture_model} \title{k-component Gaussian mixture EM} \usage{ hrzm1(y, k = 2, maxit = 200, tol = 1e-06, seed = 0) -horowitz_mixture_model(y, k = 2, maxit = 200, tol = 1e-06, seed = 0) +morie_horowitz_mixture_model(y, k = 2, maxit = 200, tol = 1e-06, seed = 0) } \arguments{ \item{y}{Numeric vector of observations.} diff --git a/r-package/morie/man/hrzn1.Rd b/r-package/morie/man/hrzn1.Rd index 1c494419b3..6360a8b7ad 100644 --- a/r-package/morie/man/hrzn1.Rd +++ b/r-package/morie/man/hrzn1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzn1.R \name{hrzn1} \alias{hrzn1} -\alias{horowitz_nonparametric_iv} +\alias{morie_horowitz_nonparametric_iv} \title{Series-Tikhonov nonparametric instrumental variables} \usage{ hrzn1(x, y, z, J = 5, alpha = 0.001, grid = NULL, .bootstrap = TRUE) -horowitz_nonparametric_iv( +morie_horowitz_nonparametric_iv( x, y, z, diff --git a/r-package/morie/man/hrzn2.Rd b/r-package/morie/man/hrzn2.Rd index 2601e13a75..7981da25eb 100644 --- a/r-package/morie/man/hrzn2.Rd +++ b/r-package/morie/man/hrzn2.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzn2.R \name{hrzn2} \alias{hrzn2} -\alias{horowitz_deconvolution} +\alias{morie_horowitz_deconvolution} \title{Stefanski-Carroll Fourier-deconvolution density estimator} \usage{ hrzn2(y, sigma_u = 0.5, bandwidth = NULL, grid = NULL, noise = "laplace") -horowitz_deconvolution( +morie_horowitz_deconvolution( y, sigma_u = 0.5, bandwidth = NULL, diff --git a/r-package/morie/man/hrzp1.Rd b/r-package/morie/man/hrzp1.Rd index 66dd5b7069..50de9fb8f2 100644 --- a/r-package/morie/man/hrzp1.Rd +++ b/r-package/morie/man/hrzp1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzp1.R \name{hrzp1} \alias{hrzp1} -\alias{horowitz_plr_estimator} +\alias{morie_horowitz_plr_estimator} \title{Robinson partially-linear regression} \usage{ hrzp1(x, y, z, bandwidth = NULL) -horowitz_plr_estimator(x, y, z, bandwidth = NULL) +morie_horowitz_plr_estimator(x, y, z, bandwidth = NULL) } \arguments{ \item{x}{Numeric parametric covariate vector or matrix.} diff --git a/r-package/morie/man/hrzp2.Rd b/r-package/morie/man/hrzp2.Rd index 987d396525..b2a0c47ff3 100644 --- a/r-package/morie/man/hrzp2.Rd +++ b/r-package/morie/man/hrzp2.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzp2.R \name{hrzp2} \alias{hrzp2} -\alias{horowitz_plr_bandwidth} +\alias{morie_horowitz_plr_bandwidth} \title{Silverman bandwidth selector for PLR} \usage{ hrzp2(x, y, c = 1.06) -horowitz_plr_bandwidth(x, y, c = 1.06) +morie_horowitz_plr_bandwidth(x, y, c = 1.06) } \arguments{ \item{x}{Numeric vector.} diff --git a/r-package/morie/man/hrzq1.Rd b/r-package/morie/man/hrzq1.Rd index 2877d1aad5..7bce328ad3 100644 --- a/r-package/morie/man/hrzq1.Rd +++ b/r-package/morie/man/hrzq1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzq1.R \name{hrzq1} \alias{hrzq1} -\alias{horowitz_quantile_regression} +\alias{morie_horowitz_quantile_regression} \title{Koenker-Bassett linear quantile regression} \usage{ hrzq1(x, y, tau = 0.5) -horowitz_quantile_regression(x, y, tau = 0.5) +morie_horowitz_quantile_regression(x, y, tau = 0.5) } \arguments{ \item{x}{Numeric covariate vector or design matrix.} diff --git a/r-package/morie/man/hrzs1.Rd b/r-package/morie/man/hrzs1.Rd index eeee7ebad6..ee4b24a07d 100644 --- a/r-package/morie/man/hrzs1.Rd +++ b/r-package/morie/man/hrzs1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzs1.R \name{hrzs1} \alias{hrzs1} -\alias{horowitz_sample_selection} +\alias{morie_horowitz_sample_selection} \title{Heckman-Powell-Newey-Vella semiparametric sample-selection} \usage{ hrzs1(x, y, z, d) -horowitz_sample_selection(x, y, z, d) +morie_horowitz_sample_selection(x, y, z, d) } \arguments{ \item{x}{Numeric outcome covariates.} diff --git a/r-package/morie/man/hrzt1.Rd b/r-package/morie/man/hrzt1.Rd index 73efd5238c..f51bb2b94d 100644 --- a/r-package/morie/man/hrzt1.Rd +++ b/r-package/morie/man/hrzt1.Rd @@ -2,12 +2,18 @@ % Please edit documentation in R/hrzt1.R \name{hrzt1} \alias{hrzt1} -\alias{horowitz_treatment_effect} +\alias{morie_horowitz_treatment_effect} \title{Heckman-Ichimura-Todd kernel-matching ATE} \usage{ hrzt1(x, y, treatment, bandwidth = NULL, .bootstrap = TRUE) -horowitz_treatment_effect(x, y, treatment, bandwidth = NULL, .bootstrap = TRUE) +morie_horowitz_treatment_effect( + x, + y, + treatment, + bandwidth = NULL, + .bootstrap = TRUE +) } \arguments{ \item{x}{Numeric covariate vector or matrix.} diff --git a/r-package/morie/man/hrzt2.Rd b/r-package/morie/man/hrzt2.Rd index a9e1fbf518..da8761711f 100644 --- a/r-package/morie/man/hrzt2.Rd +++ b/r-package/morie/man/hrzt2.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzt2.R \name{hrzt2} \alias{hrzt2} -\alias{horowitz_local_ate} +\alias{morie_horowitz_local_ate} \title{IV Wald estimator for LATE (Imbens-Angrist)} \usage{ hrzt2(x, y, z, treatment) -horowitz_local_ate(x, y, z, treatment) +morie_horowitz_local_ate(x, y, z, treatment) } \arguments{ \item{x}{Numeric covariates (unused at present; kept for API parity).} diff --git a/r-package/morie/man/hrzw1.Rd b/r-package/morie/man/hrzw1.Rd index 046e80e590..83c5311451 100644 --- a/r-package/morie/man/hrzw1.Rd +++ b/r-package/morie/man/hrzw1.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzw1.R \name{hrzw1} \alias{hrzw1} -\alias{horowitz_wild_bootstrap} +\alias{morie_horowitz_wild_bootstrap} \title{Rademacher wild bootstrap for OLS coefficients} \usage{ hrzw1(x, y, residuals = NULL, B = 500, seed = 0) -horowitz_wild_bootstrap(x, y, residuals = NULL, B = 500, seed = 0) +morie_horowitz_wild_bootstrap(x, y, residuals = NULL, B = 500, seed = 0) } \arguments{ \item{x}{Numeric covariate vector or design matrix.} diff --git a/r-package/morie/man/hrzw2.Rd b/r-package/morie/man/hrzw2.Rd index fe6791512d..f1e5c75d84 100644 --- a/r-package/morie/man/hrzw2.Rd +++ b/r-package/morie/man/hrzw2.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/hrzw2.R \name{hrzw2} \alias{hrzw2} -\alias{horowitz_bandwidth_bootstrap} +\alias{morie_horowitz_bandwidth_bootstrap} \title{Wild-bootstrap MISE bandwidth selection for NW regression} \usage{ hrzw2(x, y, B = 50, n_h = 15, seed = 0) -horowitz_bandwidth_bootstrap(x, y, B = 50, n_h = 15, seed = 0) +morie_horowitz_bandwidth_bootstrap(x, y, B = 50, n_h = 15, seed = 0) } \arguments{ \item{x}{Numeric covariate vector.} diff --git a/r-package/morie/man/idlpt.Rd b/r-package/morie/man/idlpt.Rd index f558173c81..60e055f438 100644 --- a/r-package/morie/man/idlpt.Rd +++ b/r-package/morie/man/idlpt.Rd @@ -2,15 +2,15 @@ % Please edit documentation in R/idlpt.R \name{idlpt} \alias{idlpt} -\alias{ideal_point_recovery} -\alias{ideal_point_model} +\alias{morie_ideal_point_recovery} +\alias{morie_ideal_point_model} \title{Ideal point recovery from unfolding configuration (Armstrong Ch 2)} \usage{ idlpt(X_r, X_s = NULL) -ideal_point_recovery(X_r, X_s = NULL) +morie_ideal_point_recovery(X_r, X_s = NULL) -ideal_point_model(X_r, X_s = NULL) +morie_ideal_point_model(X_r, X_s = NULL) } \arguments{ \item{X_r}{Row (respondent) coordinates (n by k matrix or vector).} @@ -28,9 +28,7 @@ unfolding solution. By the unfolding definition the respondent position is the ideal point (Eq 4.36 in Armstrong et al. 2014). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \keyword{internal} diff --git a/r-package/morie/man/impsm.Rd b/r-package/morie/man/impsm.Rd index 584f491c41..81938989d6 100644 --- a/r-package/morie/man/impsm.Rd +++ b/r-package/morie/man/impsm.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/impsm.R \name{impsm} \alias{impsm} -\alias{importance_sampling} +\alias{morie_importance_sampling} \title{Importance sampling (Geweke 1989)} \usage{ impsm(x, h = NULL, p = NULL, q = NULL) -importance_sampling(x, h = NULL, p = NULL, q = NULL) +morie_importance_sampling(x, h = NULL, p = NULL, q = NULL) } \arguments{ \item{x}{numeric draws from q.} diff --git a/r-package/morie/man/indkr.Rd b/r-package/morie/man/indkr.Rd index 4fe7bedcaf..66ed82a754 100644 --- a/r-package/morie/man/indkr.Rd +++ b/r-package/morie/man/indkr.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/indkr.R \name{indkr} \alias{indkr} -\alias{indicator_kriging} +\alias{morie_indicator_kriging} \title{Indicator kriging for exceedance probability.} \usage{ indkr(x, coords, threshold, target = NULL, nugget = 0, sill = 0.25, range_ = 1) -indicator_kriging( +morie_indicator_kriging( x, coords, threshold, @@ -36,10 +36,7 @@ Encode I_i = 1(x_i <= threshold) and ordinary-krige the indicator field at each target s0 to obtain P_hat(Z(s0) <= threshold). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +indkr(x = rnorm(50), coords = matrix(runif(100), 50, 2), threshold = 0.5) } \references{ Journel (1983); Schabenberger & Gotway (2005), Ch 4. diff --git a/r-package/morie/man/irtsp.Rd b/r-package/morie/man/irtsp.Rd index 9ef0e9c4d8..15d6c4c15f 100644 --- a/r-package/morie/man/irtsp.Rd +++ b/r-package/morie/man/irtsp.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/irtsp.R \name{irtsp} \alias{irtsp} -\alias{irt_spatial} +\alias{morie_irt_spatial} \title{IRT 2PL spatial ideal-point model (Armstrong Ch 4)} \usage{ irtsp(x, n_iter = 60L, tol = 1e-06) -irt_spatial(x, n_iter = 60L, tol = 1e-06) +morie_irt_spatial(x, n_iter = 60L, tol = 1e-06) } \arguments{ \item{x}{Binary roll-call matrix (n by m).} @@ -25,9 +25,6 @@ Joint-MLE alternating updates for Clinton-Jackman-Rivers 2PL. Standardises latent x to mean 0, SD 1 every sweep for identification. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +irtsp(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/isotn.Rd b/r-package/morie/man/isotn.Rd index c74e06659d..02f6f5e05a 100644 --- a/r-package/morie/man/isotn.Rd +++ b/r-package/morie/man/isotn.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/isotn.R \name{isotn} \alias{isotn} -\alias{isotonic_regression} +\alias{morie_isotonic_regression} \title{Isotonic regression via PAVA (Barlow et al. 1972)} \usage{ isotn(x, y, weights = NULL, increasing = TRUE) -isotonic_regression(x, y, weights = NULL, increasing = TRUE) +morie_isotonic_regression(x, y, weights = NULL, increasing = TRUE) } \arguments{ \item{x}{numeric predictor.} diff --git a/r-package/morie/man/jackknife_estimate.Rd b/r-package/morie/man/jackknife_estimate.Rd deleted file mode 100644 index 348b58a0e4..0000000000 --- a/r-package/morie/man/jackknife_estimate.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{jackknife_estimate} -\alias{jackknife_estimate} -\title{Delete-1 jackknife variance estimate} -\description{ - Delete-1 jackknife variance estimate -} -\usage{ - jackknife_estimate(df, statistic) -} -\arguments{ - \item{df}{A data frame.} - \item{statistic}{A function taking a data frame and returning a scalar.} -} -\value{ - Named list: 'estimate', 'se', 'bias'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/jkest.Rd b/r-package/morie/man/jkest.Rd index b70ce1086c..94e5d5769f 100644 --- a/r-package/morie/man/jkest.Rd +++ b/r-package/morie/man/jkest.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/jkest.R \name{jkest} \alias{jkest} -\alias{jackknife_estimator} +\alias{morie_jackknife_estimator} \title{Jackknife bias and variance (Quenouille 1956, Tukey 1958)} \usage{ jkest(x, statistic = NULL) -jackknife_estimator(x, statistic = NULL) +morie_jackknife_estimator(x, statistic = NULL) } \arguments{ \item{x}{numeric vector.} diff --git a/r-package/morie/man/kendall_tau.Rd b/r-package/morie/man/kendall_tau.Rd deleted file mode 100644 index 75d11b90c2..0000000000 --- a/r-package/morie/man/kendall_tau.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{kendall_tau} -\alias{kendall_tau} -\title{Kendall's tau-b} -\description{ - Kendall's tau-b -} -\usage{ - kendall_tau(x, y) -} -\arguments{ - \item{x}{Numeric vector.} - \item{y}{Numeric vector.} -} -\value{ - Named list: 'tau', 'p_value'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/kruskal_wallis_test.Rd b/r-package/morie/man/kruskal_wallis_test.Rd deleted file mode 100644 index 0a8994347b..0000000000 --- a/r-package/morie/man/kruskal_wallis_test.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by morie generate_rd.py -\name{kruskal_wallis_test} -\alias{kruskal_wallis_test} -\title{Kruskal-Wallis non-parametric ANOVA} -\description{ - Kruskal-Wallis non-parametric ANOVA -} -\usage{ - kruskal_wallis_test(...) -} -\arguments{ - \item{...}{Numeric vectors, one per group.} -} -\value{ - Named list: 'H', 'df', 'p_value'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/latnh.Rd b/r-package/morie/man/latnh.Rd index d973955e7a..8a9ee61206 100644 --- a/r-package/morie/man/latnh.Rd +++ b/r-package/morie/man/latnh.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/latnh.R \name{latnh} \alias{latnh} -\alias{latin_hypercube} +\alias{morie_latin_hypercube} \title{Latin hypercube sampling (McKay, Beckman & Conover 1979)} \usage{ latnh(N = 100L, d = 1L, f = NULL, seed = 42L) -latin_hypercube(N = 100L, d = 1L, f = NULL, seed = 42L) +morie_latin_hypercube(N = 100L, d = 1L, f = NULL, seed = 42L) } \arguments{ \item{N}{integer; sample size (default 100).} diff --git a/r-package/morie/man/levene_test.Rd b/r-package/morie/man/levene_test.Rd deleted file mode 100644 index ec8e6d74ad..0000000000 --- a/r-package/morie/man/levene_test.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by morie generate_rd.py -\name{levene_test} -\alias{levene_test} -\title{Levene test for equality of variances} -\description{ - Levene test for equality of variances -} -\usage{ - levene_test(...) -} -\arguments{ - \item{...}{Numeric vectors, one per group.} -} -\value{ - Named list: 'F', 'p_value'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/license_check.Rd b/r-package/morie/man/license_check.Rd index c15c3d6f88..e68c8e61a2 100644 --- a/r-package/morie/man/license_check.Rd +++ b/r-package/morie/man/license_check.Rd @@ -3,6 +3,12 @@ \name{license_check} \alias{license_check} \title{Runtime license-compatibility guard for morie} +\value{ +\code{morie_gpl_compatible_licenses()} returns a character vector +of GPL-compatible SPDX identifiers; \code{check_plugin_license()} returns +a logical (invisibly), signalling a warning or error when the supplied +licence is not GPL-compatible. +} \description{ R parity of \code{morie._license_check}. Exposes the FSF GPL-compatible licence list and a @@ -14,3 +20,6 @@ stronger guarantees see the companion userspace LSM-style daemon (\code{daemon/morie_lsm.py}) and the kernel companion module (\code{kernel-module/morie.c}). } +\examples{ +morie_gpl_compatible_licenses() +} diff --git a/r-package/morie/man/load_cpads_data.Rd b/r-package/morie/man/load_cpads_data.Rd deleted file mode 100644 index 51d28e988a..0000000000 --- a/r-package/morie/man/load_cpads_data.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{load_cpads_data} -\alias{load_cpads_data} -\title{Load the real CPADS CSV from this repository} -\description{ - Load the real CPADS CSV from this repository -} -\usage{ - load_cpads_data(cpads_csv) -} -\arguments{ - \item{cpads_csv}{Path to the CPADS CSV.} -} -\value{ - Canonicalized CPADS data frame. -} -\examples{ -\dontrun{ - # Reads a local CPADS CSV (no network). - cpads <- load_cpads_data("/path/to/cpads.csv") - nrow(cpads); names(cpads)[1:5] -} -} diff --git a/r-package/morie/man/longitudinal_sim.Rd b/r-package/morie/man/longitudinal_sim.Rd index dfd4056dce..1b2cf3ef66 100644 --- a/r-package/morie/man/longitudinal_sim.Rd +++ b/r-package/morie/man/longitudinal_sim.Rd @@ -3,6 +3,12 @@ \name{longitudinal_sim} \alias{longitudinal_sim} \title{Synchronised longitudinal-panel simulation (R parity)} +\value{ +The simulation callables return tidy longitudinal-panel +\code{data.frame}s; \code{morie_sync_rng()} returns an environment +exposing synchronised \code{rnorm}, \code{runif}, and \code{sample} +methods. +} \description{ Clean-room R parity of \code{morie.longitudinal_sim} for synchronised multivariate longitudinal-panel simulation. Implements SyncRNG, @@ -20,3 +26,6 @@ Toeplitz / compound-symmetric covariance --- are standard methods from Hamilton (1994) and Diggle, Liang, Zeger (1994), implemented here independently. } +\examples{ +rng <- morie_sync_rng(42) +} diff --git a/r-package/morie/man/mann_whitney_test.Rd b/r-package/morie/man/mann_whitney_test.Rd deleted file mode 100644 index 5620915fb3..0000000000 --- a/r-package/morie/man/mann_whitney_test.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by morie generate_rd.py -\name{mann_whitney_test} -\alias{mann_whitney_test} -\title{Mann-Whitney U test (Wilcoxon rank-sum)} -\description{ - Mann-Whitney U test (Wilcoxon rank-sum) -} -\usage{ - mann_whitney_test(x1, x2, alternative) -} -\arguments{ - \item{x1}{Numeric vector (group 1).} - \item{x2}{Numeric vector (group 2).} - \item{alternative}{'"two.sided"', '"greater"', or '"less"'.} -} -\value{ - Named list: 'W', 'p_value', 'r' (effect size). -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/mcint_crude.Rd b/r-package/morie/man/mcint_crude.Rd index e22205c8b6..2b2e58290e 100644 --- a/r-package/morie/man/mcint_crude.Rd +++ b/r-package/morie/man/mcint_crude.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/mcint.R \name{mcint_crude} \alias{mcint_crude} -\alias{monte_carlo_integration} +\alias{morie_monte_carlo_integration} \title{Monte Carlo integration (Rubinstein 1981)} \usage{ mcint_crude(f, a = 0, b = 1, N = 1000L, seed = 42L) -monte_carlo_integration(f, a = 0, b = 1, N = 1000L, seed = 42L) +morie_monte_carlo_integration(f, a = 0, b = 1, N = 1000L, seed = 42L) } \arguments{ \item{f}{function on (a, b).} diff --git a/r-package/morie/man/mdspl.Rd b/r-package/morie/man/mdspl.Rd index 02964a8910..b10f4bf703 100644 --- a/r-package/morie/man/mdspl.Rd +++ b/r-package/morie/man/mdspl.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/mdspl.R \name{mdspl} \alias{mdspl} -\alias{mds_spatial_map} +\alias{morie_mds_spatial_map} \title{Classical MDS for spatial map of legislators (Armstrong Ch 7)} \usage{ mdspl(x, k = 2L) -mds_spatial_map(x, k = 2L) +morie_mds_spatial_map(x, k = 2L) } \arguments{ \item{x}{Either an (n by p) configuration matrix OR an (n by n) @@ -24,9 +24,6 @@ Torgerson double-centring followed by top-k eigen-decomposition. Reports Stress-1 = sqrt(sum (d - hat_d)^2 / sum d^2). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +mdspl(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/mdvtr.Rd b/r-package/morie/man/mdvtr.Rd index 75711ada26..fbbb8f58f7 100644 --- a/r-package/morie/man/mdvtr.Rd +++ b/r-package/morie/man/mdvtr.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/mdvtr.R \name{mdvtr} \alias{mdvtr} -\alias{median_voter} +\alias{morie_median_voter} \title{Median voter theorem (Armstrong et al. Ch 2)} \usage{ mdvtr(x) -median_voter(x) +morie_median_voter(x) } \arguments{ \item{x}{Numeric vector of voter ideal points.} @@ -22,10 +22,7 @@ Black (1948) median-voter theorem: with single-peaked preferences in asymptotic SE 1.2533 * s/sqrt(n) for normal-like data. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +mdvtr(x = rnorm(50)) } \references{ Armstrong et al. (2014), Ch 2. diff --git a/r-package/morie/man/midranks.Rd b/r-package/morie/man/midranks.Rd index e6d74ddf3b..5b070aa4c3 100644 --- a/r-package/morie/man/midranks.Rd +++ b/r-package/morie/man/midranks.Rd @@ -17,8 +17,5 @@ Identical to \code{rank(x, ties.method = "average")} plus a tie- correction term \verb{sum t_j^3 - t_j} over tied groups. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +midranks(x = rnorm(50)) } diff --git a/r-package/morie/man/mnpbt.Rd b/r-package/morie/man/mnpbt.Rd index 170ce92b53..f49f622bd8 100644 --- a/r-package/morie/man/mnpbt.Rd +++ b/r-package/morie/man/mnpbt.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/mnpbt.R \name{mnpbt} \alias{mnpbt} -\alias{multinomial_probit_spatial} +\alias{morie_multinomial_probit_spatial} \title{Multinomial probit (spatial choice; Armstrong Ch 9)} \usage{ mnpbt(x, n_draws = 2000L, seed = 0L) -multinomial_probit_spatial(x, n_draws = 2000L, seed = 0L) +morie_multinomial_probit_spatial(x, n_draws = 2000L, seed = 0L) } \arguments{ \item{x}{Utility matrix (n_obs by n_alt).} @@ -26,9 +26,6 @@ errors over a deterministic utility matrix; binary case uses the closed-form Phi((U1-U0)/sqrt(2)). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +mnpbt(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/morie-package.Rd b/r-package/morie/man/morie-package.Rd index dd97f97db1..e82f124165 100644 --- a/r-package/morie/man/morie-package.Rd +++ b/r-package/morie/man/morie-package.Rd @@ -72,7 +72,9 @@ file in the source distribution): \emph{Solitary Confinement, Torture, and Canada's Structured Intervention Units.} Centre for Criminology and Sociolegal Studies, University of Toronto. -\url{https://www.crimsl.utoronto.ca/sites/www.crimsl.utoronto.ca/files/TortureSolitarySIUsSprottDoob23Feb2021_0.pdf} +Available at the Centre for Criminology and Sociolegal +Studies web site: crimsl.utoronto.ca (file +TortureSolitarySIUsSprottDoob23Feb2021_0.pdf). \item Doob, A. N. and Sprott, J. B. (2020). \emph{Understanding the Operation of Correctional Service Canada's Structured Intervention Units: Some Preliminary diff --git a/r-package/morie/man/aniso.Rd b/r-package/morie/man/morie_aniso.Rd similarity index 75% rename from r-package/morie/man/aniso.Rd rename to r-package/morie/man/morie_aniso.Rd index 732f3513f3..4781312b66 100644 --- a/r-package/morie/man/aniso.Rd +++ b/r-package/morie/man/morie_aniso.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/aniso.R -\name{aniso} -\alias{aniso} -\alias{anisotropy_test} +\name{morie_aniso} +\alias{morie_aniso} +\alias{morie_anisotropy_test} \title{Anisotropy detection via Levene comparison of directional pair-difference distributions.} \usage{ -aniso(x, coords, n_dirs = 4, tol_deg = 22.5) +morie_aniso(x, coords, n_dirs = 4, tol_deg = 22.5) -anisotropy_test(x, coords, n_dirs = 4, tol_deg = 22.5) +morie_anisotropy_test(x, coords, n_dirs = 4, tol_deg = 22.5) } \arguments{ \item{x}{Numeric vector.} @@ -29,10 +29,7 @@ vector falls within \code{tol_deg} of the direction and apply Levene's test across direction groups. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_aniso(x = rnorm(50), coords = matrix(runif(100), 50, 2)) } \references{ Goovaerts (1997); Schabenberger & Gotway (2005), Ch 3. diff --git a/r-package/morie/man/morie_anova_one_way.Rd b/r-package/morie/man/morie_anova_one_way.Rd new file mode 100644 index 0000000000..dcdb88b3a4 --- /dev/null +++ b/r-package/morie/man/morie_anova_one_way.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_anova_one_way} +\alias{morie_anova_one_way} +\title{One-way ANOVA} +\usage{ +morie_anova_one_way(...) +} +\arguments{ +\item{...}{Numeric vectors, one per group.} +} +\value{ +Named list: \code{F}, \code{df_between}, \code{df_within}, \code{p_value}, +\code{morie_eta_squared}. +} +\description{ +One-way ANOVA +} +\examples{ +morie_anova_one_way(rnorm(30, 0), rnorm(30, 0.5), rnorm(30, 1)) +} diff --git a/r-package/morie/man/arch_in_mean.Rd b/r-package/morie/man/morie_arch_in_mean.Rd similarity index 67% rename from r-package/morie/man/arch_in_mean.Rd rename to r-package/morie/man/morie_arch_in_mean.Rd index dc63cc4b28..26dde2b20b 100644 --- a/r-package/morie/man/arch_in_mean.Rd +++ b/r-package/morie/man/morie_arch_in_mean.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/archm.R -\name{arch_in_mean} -\alias{arch_in_mean} +\name{morie_arch_in_mean} +\alias{morie_arch_in_mean} \title{ARCH(1)-in-mean model} \usage{ -arch_in_mean(x) +morie_arch_in_mean(x) } \arguments{ \item{x}{Numeric return series.} @@ -17,8 +17,5 @@ Named list with \code{mu, delta, omega, alpha, loglik, ARCH(1)-in-mean model } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_arch_in_mean(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_ask_percy.Rd b/r-package/morie/man/morie_ask_percy.Rd new file mode 100644 index 0000000000..80a8ca5a45 --- /dev/null +++ b/r-package/morie/man/morie_ask_percy.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/perseus.R +\name{morie_ask_percy} +\alias{morie_ask_percy} +\alias{morie_assistant_query} +\title{Query Perseus via Python} +\usage{ +morie_ask_percy( + question, + context = NULL, + python_bin = Sys.getenv("MORIE_PYTHON_BIN", "python3") +) + +morie_assistant_query( + question, + context = NULL, + python_bin = Sys.getenv("MORIE_PYTHON_BIN", "python3") +) +} +\arguments{ +\item{question}{User question.} + +\item{context}{Optional context string.} + +\item{python_bin}{Python executable to use. Defaults to \code{MORIE_PYTHON_BIN} or \code{python3}.} +} +\value{ +Agent text response. +} +\description{ +Query Perseus via Python +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} +\keyword{internal} diff --git a/r-package/morie/man/attnq_scaled_dot_product_attention.Rd b/r-package/morie/man/morie_attnq_scaled_dot_product_attention.Rd similarity index 67% rename from r-package/morie/man/attnq_scaled_dot_product_attention.Rd rename to r-package/morie/man/morie_attnq_scaled_dot_product_attention.Rd index 0dd6a8653c..b0179edd79 100644 --- a/r-package/morie/man/attnq_scaled_dot_product_attention.Rd +++ b/r-package/morie/man/morie_attnq_scaled_dot_product_attention.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/attnq.R -\name{attnq_scaled_dot_product_attention} -\alias{attnq_scaled_dot_product_attention} -\alias{scaled_dot_product_attention} +\name{morie_attnq_scaled_dot_product_attention} +\alias{morie_attnq_scaled_dot_product_attention} +\alias{morie_scaled_dot_product_attention} \title{Scaled dot-product attention} \usage{ -attnq_scaled_dot_product_attention(Q, K = NULL, V = NULL, mask = NULL) +morie_attnq_scaled_dot_product_attention(Q, K = NULL, V = NULL, mask = NULL) -scaled_dot_product_attention(Q, K = NULL, V = NULL, mask = NULL) +morie_scaled_dot_product_attention(Q, K = NULL, V = NULL, mask = NULL) } \arguments{ \item{Q}{Numeric matrix \code{(n_q, d_k)}.} @@ -29,10 +29,7 @@ R parity for \code{morie.fn.attnq.scaled_dot_product_attention}. \mathrm{softmax}\!\left(\tfrac{Q K^\top}{\sqrt{d_k}}\right) V} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_attnq_scaled_dot_product_attention(Q = matrix(rnorm(150), 50, 3)) } \references{ Vaswani et al. (2017), NeurIPS. diff --git a/r-package/morie/man/morie_audit_public_outputs.Rd b/r-package/morie/man/morie_audit_public_outputs.Rd new file mode 100644 index 0000000000..cb7d08bf41 --- /dev/null +++ b/r-package/morie/man/morie_audit_public_outputs.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/manifest.R +\name{morie_audit_public_outputs} +\alias{morie_audit_public_outputs} +\title{Audit declared outputs against files on disk} +\usage{ +morie_audit_public_outputs(project_root = NULL, manifest = NULL) +} +\arguments{ +\item{project_root}{Project root directory.} + +\item{manifest}{Manifest data frame. If \code{NULL}, loaded from disk.} +} +\value{ +Data frame containing declared and observed output status. +} +\description{ +Audit declared outputs against files on disk +} +\examples{ +# Craft a tempdir manifest + output file, then audit: +tdir <- tempfile("morie-doc-") +dir.create(tdir) +writeLines("x,y\n1,2", file.path(tdir, "results.csv")) +man <- data.frame( + output = "results.csv", + public_path = file.path(tdir, "results.csv"), + size_kb = 0.01, modified = format(Sys.Date()) +) +morie_audit_public_outputs(project_root = tdir, manifest = man) +} diff --git a/r-package/morie/man/bayes_cpi_genomic.Rd b/r-package/morie/man/morie_bayes_cpi_genomic.Rd similarity index 85% rename from r-package/morie/man/bayes_cpi_genomic.Rd rename to r-package/morie/man/morie_bayes_cpi_genomic.Rd index 1360b87924..ac4f81a079 100644 --- a/r-package/morie/man/bayes_cpi_genomic.Rd +++ b/r-package/morie/man/morie_bayes_cpi_genomic.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/bglup.R -\name{bayes_cpi_genomic} -\alias{bayes_cpi_genomic} +\name{morie_bayes_cpi_genomic} +\alias{morie_bayes_cpi_genomic} \title{BayesC-pi spike-and-slab variable selection (short Gibbs)} \usage{ -bayes_cpi_genomic( +morie_bayes_cpi_genomic( x, y, n_iter = 300, @@ -39,10 +39,7 @@ list(estimate, beta, beta_pip, pi, sigma_b2, sigma2, n_iter, n, p, method). BayesC-pi spike-and-slab variable selection (short Gibbs) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_bayes_cpi_genomic(x = rnorm(50), y = rnorm(50)) } \references{ Habier-Fernando-Kizilkaya-Garrick (2011); Montesinos Lopez Ch 4. diff --git a/r-package/morie/man/bayes_ridge_gibbs.Rd b/r-package/morie/man/morie_bayes_ridge_gibbs.Rd similarity index 85% rename from r-package/morie/man/bayes_ridge_gibbs.Rd rename to r-package/morie/man/morie_bayes_ridge_gibbs.Rd index 49c3a8295c..e30341ad5a 100644 --- a/r-package/morie/man/bayes_ridge_gibbs.Rd +++ b/r-package/morie/man/morie_bayes_ridge_gibbs.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/brdgf.R -\name{bayes_ridge_gibbs} -\alias{bayes_ridge_gibbs} +\name{morie_bayes_ridge_gibbs} +\alias{morie_bayes_ridge_gibbs} \title{BayesA via short Gibbs sampler (Meuwissen-Hayes-Goddard 2001)} \usage{ -bayes_ridge_gibbs( +morie_bayes_ridge_gibbs( x, y, n_iter = 200, @@ -42,10 +42,7 @@ list(estimate, beta, beta_se, sigma_j2, sigma2, n_iter, n, p, method). Per-marker variance with scaled inverse chi-squared prior. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_bayes_ridge_gibbs(x = rnorm(50), y = rnorm(50)) } \references{ Meuwissen-Hayes-Goddard (2001) Genetics 157:1819. diff --git a/r-package/morie/man/bayesian_lasso_full.Rd b/r-package/morie/man/morie_bayesian_lasso_full.Rd similarity index 80% rename from r-package/morie/man/bayesian_lasso_full.Rd rename to r-package/morie/man/morie_bayesian_lasso_full.Rd index a69f7ac182..3ebc8f0c63 100644 --- a/r-package/morie/man/bayesian_lasso_full.Rd +++ b/r-package/morie/man/morie_bayesian_lasso_full.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/blasf.R -\name{bayesian_lasso_full} -\alias{bayesian_lasso_full} +\name{morie_bayesian_lasso_full} +\alias{morie_bayesian_lasso_full} \title{Bayesian LASSO (Park & Casella 2008 short Gibbs)} \usage{ -bayesian_lasso_full( +morie_bayesian_lasso_full( x, y, n_iter = 200, @@ -39,10 +39,11 @@ list(estimate, beta, intercept, se, beta_se, lam, sigma2, n_iter, n, p, method). Bayesian LASSO (Park & Casella 2008 short Gibbs) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_bayesian_lasso_full( + x = matrix(rnorm(150), 50, 3), y = rnorm(50), + n_iter = 50L, burn = 10L, lam = 1, seed = 1L, + deterministic_seed = TRUE +) } \references{ Park & Casella (2008) JASA 103:681. Montesinos Lopez Ch 4. diff --git a/r-package/morie/man/bayesian_ridge_regression.Rd b/r-package/morie/man/morie_bayesian_ridge_regression.Rd similarity index 70% rename from r-package/morie/man/bayesian_ridge_regression.Rd rename to r-package/morie/man/morie_bayesian_ridge_regression.Rd index 973cb2c0ba..d3dec8f5ac 100644 --- a/r-package/morie/man/bayesian_ridge_regression.Rd +++ b/r-package/morie/man/morie_bayesian_ridge_regression.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/brreg.R -\name{bayesian_ridge_regression} -\alias{bayesian_ridge_regression} +\name{morie_bayesian_ridge_regression} +\alias{morie_bayesian_ridge_regression} \title{Bayesian ridge regression (RR-BLUP closed form)} \usage{ -bayesian_ridge_regression(x, y, lam = NULL) +morie_bayesian_ridge_regression(x, y, lam = NULL) } \arguments{ \item{x}{(n x p) marker matrix.} @@ -20,10 +20,7 @@ list(estimate, beta, intercept, se, beta_se, lam, n, p, method). beta_hat = solve(X'X + lambda\emph{I) \%}\% X'y } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_bayesian_ridge_regression(x = rnorm(50), y = rnorm(50)) } \references{ Montesinos Lopez Ch 4. diff --git a/r-package/morie/man/bkprp_backpropagation.Rd b/r-package/morie/man/morie_bkprp_backpropagation.Rd similarity index 74% rename from r-package/morie/man/bkprp_backpropagation.Rd rename to r-package/morie/man/morie_bkprp_backpropagation.Rd index b9cb3083c9..405ccfa27c 100644 --- a/r-package/morie/man/bkprp_backpropagation.Rd +++ b/r-package/morie/man/morie_bkprp_backpropagation.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/bkprp.R -\name{bkprp_backpropagation} -\alias{bkprp_backpropagation} -\alias{backpropagation} +\name{morie_bkprp_backpropagation} +\alias{morie_bkprp_backpropagation} +\alias{morie_backpropagation} \title{Backpropagation gradient computation (1-layer MSE)} \usage{ -bkprp_backpropagation(x, y, w = NULL, b = NULL, activation = "sigmoid") +morie_bkprp_backpropagation(x, y, w = NULL, b = NULL, activation = "sigmoid") -backpropagation(x, y, w = NULL, b = NULL, activation = "sigmoid") +morie_backpropagation(x, y, w = NULL, b = NULL, activation = "sigmoid") } \arguments{ \item{x}{Numeric matrix \code{(batch, n_in)} or vector.} @@ -34,10 +34,7 @@ For \eqn{L = \tfrac{1}{2n}\sum (a - y)^2} with \delta = (a - y) \odot \sigma'(z)} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_bkprp_backpropagation(x = rnorm(50), y = rnorm(50)) } \references{ Rumelhart, Hinton & Williams (1986); Goodfellow et al. (2016). diff --git a/r-package/morie/man/bnfwd_batch_norm_forward.Rd b/r-package/morie/man/morie_bnfwd_batch_norm_forward.Rd similarity index 67% rename from r-package/morie/man/bnfwd_batch_norm_forward.Rd rename to r-package/morie/man/morie_bnfwd_batch_norm_forward.Rd index 90816c72cb..3aac78c3fb 100644 --- a/r-package/morie/man/bnfwd_batch_norm_forward.Rd +++ b/r-package/morie/man/morie_bnfwd_batch_norm_forward.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/bnfwd.R -\name{bnfwd_batch_norm_forward} -\alias{bnfwd_batch_norm_forward} -\alias{batch_norm_forward} +\name{morie_bnfwd_batch_norm_forward} +\alias{morie_bnfwd_batch_norm_forward} +\alias{morie_batch_norm_forward} \title{Batch normalization forward pass} \usage{ -bnfwd_batch_norm_forward(x, gamma = NULL, beta = NULL, eps = 1e-05) +morie_bnfwd_batch_norm_forward(x, gamma = NULL, beta = NULL, eps = 1e-05) -batch_norm_forward(x, gamma = NULL, beta = NULL, eps = 1e-05) +morie_batch_norm_forward(x, gamma = NULL, beta = NULL, eps = 1e-05) } \arguments{ \item{x}{Numeric matrix shape \code{(batch, features)}.} @@ -28,10 +28,7 @@ R parity for \code{morie.fn.bnfwd.batch_norm_forward}. \deqn{y_i = \gamma\,\frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_bnfwd_batch_norm_forward(x = rnorm(50)) } \references{ Ioffe & Szegedy (2015), ICML. diff --git a/r-package/morie/man/morie_bootstrap_sample.Rd b/r-package/morie/man/morie_bootstrap_sample.Rd new file mode 100644 index 0000000000..016751c213 --- /dev/null +++ b/r-package/morie/man/morie_bootstrap_sample.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampling.R +\name{morie_bootstrap_sample} +\alias{morie_bootstrap_sample} +\title{Bootstrap resampling for any statistic} +\usage{ +morie_bootstrap_sample(df, statistic, n_bootstrap = 1000L, seed = 42L) +} +\arguments{ +\item{df}{A data frame.} + +\item{statistic}{A function taking a data frame and returning a scalar.} + +\item{n_bootstrap}{Number of bootstrap replicates.} + +\item{seed}{Random seed.} +} +\value{ +Named list: \code{estimate}, \code{se}, \code{ci_lower}, \code{ci_upper}, +\code{distribution} (numeric vector of bootstrap statistics). +} +\description{ +Bootstrap resampling for any statistic +} +\examples{ +df <- data.frame(x = rnorm(100)) +morie_bootstrap_sample(df, statistic = function(d) mean(d$x)) +} diff --git a/r-package/morie/man/morie_build_outputs_manifest.Rd b/r-package/morie/man/morie_build_outputs_manifest.Rd new file mode 100644 index 0000000000..9c1ac9cfb3 --- /dev/null +++ b/r-package/morie/man/morie_build_outputs_manifest.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/manifest.R +\name{morie_build_outputs_manifest} +\alias{morie_build_outputs_manifest} +\title{Build an outputs manifest from a directory of artifacts} +\usage{ +morie_build_outputs_manifest( + output_dir, + manifest_path, + public_prefix = "data/manifest/outputs", + extensions = c("csv", "pdf", "png", "html", "txt", "md") +) +} +\arguments{ +\item{output_dir}{Directory containing output files.} + +\item{manifest_path}{CSV path to write.} + +\item{public_prefix}{Prefix used in \code{public_path} values.} + +\item{extensions}{File extensions to include (without dots).} +} +\value{ +Manifest data frame. +} +\description{ +Build an outputs manifest from a directory of artifacts +} +\examples{ +# Scan a tempdir of output files and build a manifest CSV: +tdir <- tempfile("morie-doc-") +dir.create(tdir) +writeLines("x,y\n1,2", file.path(tdir, "results.csv")) +writeLines("# report", file.path(tdir, "report.md")) +morie_build_outputs_manifest(tdir, file.path(tdir, "outputs_manifest.csv")) +} diff --git a/r-package/morie/man/morie_build_prompt.Rd b/r-package/morie/man/morie_build_prompt.Rd new file mode 100644 index 0000000000..e42b7889ee --- /dev/null +++ b/r-package/morie/man/morie_build_prompt.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/perseus.R +\name{morie_build_prompt} +\alias{morie_build_prompt} +\alias{build_assistant_prompt} +\title{Build a Perseus agent prompt} +\usage{ +morie_build_prompt(question, context = NULL) + +build_assistant_prompt(question, context = NULL) +} +\arguments{ +\item{question}{User question.} + +\item{context}{Optional context string.} +} +\value{ +Character scalar prompt. +} +\description{ +Build a Perseus agent prompt +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} +\keyword{internal} diff --git a/r-package/morie/man/morie_builtin_db.Rd b/r-package/morie/man/morie_builtin_db.Rd index 7fa4d780cb..3788a7d495 100644 --- a/r-package/morie/man/morie_builtin_db.Rd +++ b/r-package/morie/man/morie_builtin_db.Rd @@ -16,8 +16,5 @@ CCS, CSADS, CSUS, HealthInfobase, and CIHI datasets pre-loaded as SQLite tables. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_builtin_db() } diff --git a/r-package/morie/man/morie_cache_clear.Rd b/r-package/morie/man/morie_cache_clear.Rd new file mode 100644 index 0000000000..02c4093a0f --- /dev/null +++ b/r-package/morie/man/morie_cache_clear.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/database.R +\name{morie_cache_clear} +\alias{morie_cache_clear} +\title{Clear morie's persistent cache directory} +\usage{ +morie_cache_clear(subdir = NULL, confirm = interactive()) +} +\arguments{ +\item{subdir}{Optional subdirectory under the morie cache root to +target (e.g. \code{"siu"}, \code{"tps"}). If \code{NULL}, removes +the entire morie persistent-cache root.} + +\item{confirm}{If \code{TRUE} (default in interactive sessions), +prompts the user before deleting. Set \code{FALSE} in scripts / +batch use to skip the prompt.} +} +\value{ +Invisibly, the number of files removed. +} +\description{ +Removes files cached by morie under +\code{tools::R_user_dir("morie", "cache")} (or +\code{MORIE_CACHE_DIR} if set). morie's default behaviour writes +caches to a session-scoped \code{\link[base]{tempdir}()} +subdirectory, so this function only matters if you have explicitly +opted in to persistent caching by passing +\code{cache_dir = morie_cache_dir(...)} to any of the morie +fetchers. +} +\examples{ +\donttest{ +# Non-interactive: skip the confirmation prompt. +morie_cache_clear("siu", confirm = FALSE) +} +} +\seealso{ +\code{\link{morie_cache_dir}} +} diff --git a/r-package/morie/man/morie_cache_dir.Rd b/r-package/morie/man/morie_cache_dir.Rd new file mode 100644 index 0000000000..71d54616df --- /dev/null +++ b/r-package/morie/man/morie_cache_dir.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/database.R +\name{morie_cache_dir} +\alias{morie_cache_dir} +\title{morie cache contract} +\usage{ +morie_cache_dir(subdir = NULL) +} +\arguments{ +\item{subdir}{Optional subdirectory under the morie cache root +(e.g. \code{"siu"}, \code{"tps"}). If \code{NULL}, the cache +root itself is returned.} +} +\value{ +A file path string. The directory is \emph{not} created; +callers create it lazily only when they actually persist to disk. +} +\description{ +morie functions that persist artifacts to disk (e.g. +\code{morie_fetch_siu(cache_html = TRUE)}) default to a +\emph{session-scoped} subdirectory of \code{\link[base]{tempdir}()}, +which R automatically removes when the session ends. This is the +most conservative CRAN-Policy-compliant default: nothing morie +writes ever survives the R session unless the user explicitly +opts in. +} +\details{ +Users who want \emph{persistent} caching across sessions opt in by +passing the result of \code{morie_cache_dir(subdir)} as the +\code{cache_dir} argument, e.g.: + +\preformatted{ + morie_fetch_siu( + cache_dir = morie_cache_dir("siu"), + cache_html = TRUE + ) +} + +The persistent location is \code{tools::R_user_dir("morie", "cache")} +(R \eqn{\ge} 4.0), which on Linux defaults to +\code{~/.cache/R/morie/}, on macOS to +\code{~/Library/Caches/org.R-project.R/R/morie/}, and on Windows to +\code{\%LOCALAPPDATA\%/R/cache/R/morie/}. Users can override this +location by setting the \code{MORIE_CACHE_DIR} environment variable +before calling \code{morie_cache_dir()}. + +\strong{Active management.} CRAN Policy requires persistent caches +to be actively managed. Use \code{\link{morie_cache_clear}()} to +empty the persistent cache (or a subdirectory of it). Cached SIU +HTML is ~80-100 MB at full sweep, so clearing it occasionally is +usually unnecessary, but it is supported. +} +\examples{ +# Persistent cache root (does not write anything to disk): +morie_cache_dir() +# Per-subsystem persistent path: +morie_cache_dir("siu") +} +\seealso{ +\code{\link{morie_cache_clear}} +} diff --git a/r-package/morie/man/morie_cache_file.Rd b/r-package/morie/man/morie_cache_file.Rd index 76e26452dc..95c2e131e7 100644 --- a/r-package/morie/man/morie_cache_file.Rd +++ b/r-package/morie/man/morie_cache_file.Rd @@ -4,14 +4,16 @@ \alias{morie_cache_file} \title{Cache local RDS/CSV data into the SQLite database} \usage{ -morie_cache_file(path, table_name, db_path = NULL) +morie_cache_file(path, table_name, db_path = NULL, con = NULL) } \arguments{ \item{path}{Path to a CSV or RDS file.} \item{table_name}{Name for the cached table.} -\item{db_path}{Optional override for the database path.} +\item{db_path}{Optional path to a SQLite file (default backend).} + +\item{con}{Optional pre-opened DBI connection (overrides \code{db_path}).} } \value{ Number of rows cached (invisible). @@ -21,8 +23,9 @@ Reads a local file and writes it to the cache so that CI and Docker environments (which may lack the original files) can still run tests. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +tdir <- tempfile("morie-cache-") +dir.create(tdir) +f <- file.path(tdir, "demo.csv") +write.csv(data.frame(x = 1:3, y = 4:6), f, row.names = FALSE) +morie_cache_file(f, "demo", db_path = file.path(tdir, "cache.db")) } diff --git a/r-package/morie/man/morie_cache_list.Rd b/r-package/morie/man/morie_cache_list.Rd index bc7b47c19b..06672033f7 100644 --- a/r-package/morie/man/morie_cache_list.Rd +++ b/r-package/morie/man/morie_cache_list.Rd @@ -4,10 +4,12 @@ \alias{morie_cache_list} \title{List all tables in the MORIE cache} \usage{ -morie_cache_list(db_path = NULL) +morie_cache_list(db_path = NULL, con = NULL) } \arguments{ -\item{db_path}{Optional override for the database path.} +\item{db_path}{Optional path to a SQLite file (default backend).} + +\item{con}{Optional pre-opened DBI connection (overrides \code{db_path}).} } \value{ A data.frame with columns \code{table} and \code{rows}. @@ -16,8 +18,10 @@ A data.frame with columns \code{table} and \code{rows}. List all tables in the MORIE cache } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") +\donttest{ +db <- tempfile(fileext = ".db") +morie_cache_store(data.frame(x = 1:3), "demo", db_path = db) +morie_cache_list(db_path = db) +file.remove(db) } } diff --git a/r-package/morie/man/morie_cache_load.Rd b/r-package/morie/man/morie_cache_load.Rd index 4cfb3b54a5..520b4175e2 100644 --- a/r-package/morie/man/morie_cache_load.Rd +++ b/r-package/morie/man/morie_cache_load.Rd @@ -4,12 +4,14 @@ \alias{morie_cache_load} \title{Load a table from the MORIE cache} \usage{ -morie_cache_load(table_name, db_path = NULL) +morie_cache_load(table_name, db_path = NULL, con = NULL) } \arguments{ -\item{table_name}{Name of the SQLite table.} +\item{table_name}{Name of the table.} -\item{db_path}{Optional override for the database path.} +\item{db_path}{Optional path to a SQLite file (default backend).} + +\item{con}{Optional pre-opened DBI connection (overrides \code{db_path}).} } \value{ A data.frame, or \code{NULL} if the table does not exist. @@ -18,8 +20,14 @@ A data.frame, or \code{NULL} if the table does not exist. Load a table from the MORIE cache } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") +\donttest{ +db <- tempfile(fileext = ".db") +morie_cache_store( + data = data.frame(x = 1:5), + table_name = "demo", + db_path = db +) +morie_cache_load(table_name = "demo", db_path = db) +file.remove(db) } } diff --git a/r-package/morie/man/morie_cache_store.Rd b/r-package/morie/man/morie_cache_store.Rd index 0554577e76..09625ec2c1 100644 --- a/r-package/morie/man/morie_cache_store.Rd +++ b/r-package/morie/man/morie_cache_store.Rd @@ -4,14 +4,18 @@ \alias{morie_cache_store} \title{Store a data frame in the MORIE cache} \usage{ -morie_cache_store(data, table_name, db_path = NULL) +morie_cache_store(data, table_name, db_path = NULL, con = NULL) } \arguments{ \item{data}{A data.frame to cache.} -\item{table_name}{Name of the SQLite table.} +\item{table_name}{Name of the destination table.} -\item{db_path}{Optional override for the database path.} +\item{db_path}{Optional path to a SQLite file (default backend).} + +\item{con}{Optional pre-opened DBI connection. When supplied, the +table is written through \code{con} and \code{db_path} is ignored. Use this +for non-SQLite backends (PostgreSQL, DuckDB, MariaDB).} } \value{ Number of rows written (invisible). @@ -20,8 +24,13 @@ Number of rows written (invisible). Writes (or replaces) a table in the shared SQLite cache. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") +\donttest{ +db <- tempfile(fileext = ".db") +morie_cache_store( + data = data.frame(x = rnorm(50), y = rnorm(50)), + table_name = "demo", + db_path = db +) +file.remove(db) } } diff --git a/r-package/morie/man/calculate_ebac.Rd b/r-package/morie/man/morie_calculate_ebac.Rd similarity index 80% rename from r-package/morie/man/calculate_ebac.Rd rename to r-package/morie/man/morie_calculate_ebac.Rd index 3c36634dd4..294a8450b9 100644 --- a/r-package/morie/man/calculate_ebac.Rd +++ b/r-package/morie/man/morie_calculate_ebac.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ebac.R -\name{calculate_ebac} -\alias{calculate_ebac} +\name{morie_calculate_ebac} +\alias{morie_calculate_ebac} \title{Calculate estimated Blood Alcohol Concentration (eBAC)} \usage{ -calculate_ebac(drinks, weight_lbs, hours, gender_constant) +morie_calculate_ebac(drinks, weight_lbs, hours, gender_constant) } \arguments{ \item{drinks}{Number of standard drinks consumed (1 drink = 14 g alcohol).} @@ -29,5 +29,5 @@ where \eqn{r} is the gender constant (0.73 for men, 0.66 for women). Returned values are clipped at zero. } \examples{ -calculate_ebac(drinks = 4, weight_lbs = 180, hours = 2, gender_constant = 0.73) +morie_calculate_ebac(drinks = 4, weight_lbs = 180, hours = 2, gender_constant = 0.73) } diff --git a/r-package/morie/man/calculate_ipw_weights.Rd b/r-package/morie/man/morie_calculate_ipw_weights.Rd similarity index 89% rename from r-package/morie/man/calculate_ipw_weights.Rd rename to r-package/morie/man/morie_calculate_ipw_weights.Rd index 3ae329191c..5e72227046 100644 --- a/r-package/morie/man/calculate_ipw_weights.Rd +++ b/r-package/morie/man/morie_calculate_ipw_weights.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ipw_weights.R -\name{calculate_ipw_weights} -\alias{calculate_ipw_weights} +\name{morie_calculate_ipw_weights} +\alias{morie_calculate_ipw_weights} \title{Calculate inverse probability of treatment weights (IPTW)} \usage{ -calculate_ipw_weights( +morie_calculate_ipw_weights( data, treatment, ps_col, @@ -46,6 +46,6 @@ df <- data.frame( t = rbinom(100, 1, 0.4), ps = pmin(pmax(runif(100, 0.05, 0.95), 0.05), 0.95) ) -w <- calculate_ipw_weights(df, treatment = "t", ps_col = "ps") +w <- morie_calculate_ipw_weights(df, treatment = "t", ps_col = "ps") summary(w) } diff --git a/r-package/morie/man/morie_calibration_weights.Rd b/r-package/morie/man/morie_calibration_weights.Rd new file mode 100644 index 0000000000..9137694aec --- /dev/null +++ b/r-package/morie/man/morie_calibration_weights.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampling.R +\name{morie_calibration_weights} +\alias{morie_calibration_weights} +\title{Calibration weights via iterative proportional fitting (raking)} +\usage{ +morie_calibration_weights( + df, + aux_vars, + population_totals, + initial_weights = NULL, + max_iter = 50L, + tol = 1e-06 +) +} +\arguments{ +\item{df}{A data frame.} + +\item{aux_vars}{Character vector of categorical auxiliary variable names.} + +\item{population_totals}{Named list: \code{"var_level"} -> population count. +Keys should be \code{"varname_level"} (e.g. \code{"gender_female"}).} + +\item{initial_weights}{Optional numeric vector of starting weights.} + +\item{max_iter}{Maximum IPF iterations.} + +\item{tol}{Convergence tolerance.} +} +\value{ +Numeric vector of calibrated weights. +} +\description{ +Adjusts initial design weights so that weighted marginal totals match +known population totals for each auxiliary variable. +} +\examples{ +set.seed(1) +df <- data.frame( + region = sample(c("A", "B"), 100, TRUE), + sex = sample(c("M", "F"), 100, TRUE) +) +totals <- list(region_A = 60, region_B = 40, sex_M = 55, sex_F = 45) +morie_calibration_weights(df, + aux_vars = c("region", "sex"), + population_totals = totals +) +} diff --git a/r-package/morie/man/morie_canonicalize_cpads_data.Rd b/r-package/morie/man/morie_canonicalize_cpads_data.Rd new file mode 100644 index 0000000000..8e6323ac7f --- /dev/null +++ b/r-package/morie/man/morie_canonicalize_cpads_data.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/modules.R +\name{morie_canonicalize_cpads_data} +\alias{morie_canonicalize_cpads_data} +\title{Canonicalize raw CPADS PUMF columns} +\usage{ +morie_canonicalize_cpads_data(data) +} +\arguments{ +\item{data}{Raw CPADS data frame.} +} +\value{ +Data frame with canonical MORIE analysis columns. +} +\description{ +Canonicalize raw CPADS PUMF columns +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_check_plugin_license.Rd b/r-package/morie/man/morie_check_plugin_license.Rd index d85e84f2ea..9f08f0f999 100644 --- a/r-package/morie/man/morie_check_plugin_license.Rd +++ b/r-package/morie/man/morie_check_plugin_license.Rd @@ -23,7 +23,10 @@ Check whether a downstream package's SPDX is GPL-compatible \examples{ morie_check_plugin_license("MIT") \dontrun{ - morie_check_plugin_license("LicenseRef-Proprietary", - raise_on_incompatible = TRUE) +# The next call demonstrates the error path; runs only on +# explicit example() with run.dontrun = TRUE. +morie_check_plugin_license("LicenseRef-Proprietary", + raise_on_incompatible = TRUE +) } } diff --git a/r-package/morie/man/morie_chi_square_test.Rd b/r-package/morie/man/morie_chi_square_test.Rd new file mode 100644 index 0000000000..aec3a564b0 --- /dev/null +++ b/r-package/morie/man/morie_chi_square_test.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_chi_square_test} +\alias{morie_chi_square_test} +\title{Chi-square test of independence or goodness-of-fit} +\usage{ +morie_chi_square_test(observed, expected = NULL) +} +\arguments{ +\item{observed}{Observed counts (matrix for independence, vector for GOF).} + +\item{expected}{Expected counts for GOF (optional; uniform if NULL).} +} +\value{ +Named list: \code{chi_sq}, \code{df}, \code{p_value}, \code{morie_cramers_v}. +} +\description{ +Chi-square test of independence or goodness-of-fit +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_ckan_search.Rd b/r-package/morie/man/morie_ckan_search.Rd new file mode 100644 index 0000000000..2778f6b502 --- /dev/null +++ b/r-package/morie/man/morie_ckan_search.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_access.R +\name{morie_ckan_search} +\alias{morie_ckan_search} +\title{Search any CKAN open-data portal for datasets} +\usage{ +morie_ckan_search(query, portal = "open.canada.ca", rows = 25L, ...) +} +\arguments{ +\item{query}{Free-text search string.} + +\item{portal}{A known portal name (\code{"open.canada.ca"}, +\code{"data.ontario.ca"}, \code{"open.toronto.ca"}) or a full +CKAN base URL (e.g. \code{"https://catalogue.example.org"}).} + +\item{rows}{Maximum number of datasets to return (default 25).} + +\item{...}{Extra named CKAN \code{package_search} parameters +(e.g. \code{fq = "res_format:CSV"}, \code{sort = "metadata_modified desc"}).} +} +\value{ +A data.frame with one row per resource, columns: +\code{dataset_title}, \code{dataset_id}, \code{resource_id}, +\code{resource_name}, \code{format}, \code{datastore_active}, +\code{url}. Feed \code{resource_id} into +\code{morie_fetch_ckan(resource_id = ...)}. +} +\description{ +Wraps the CKAN \code{package_search} action so users can discover +datasets that are not in the built-in MORIE catalog and fetch them +through \code{\link{morie_fetch_ckan}} or \code{\link{morie_fetch}}. +} +\examples{ +\dontrun{ +hits <- morie_ckan_search("cannabis survey", portal = "open.canada.ca") +head(hits[, c("dataset_title", "resource_id", "format")]) +} +} +\seealso{ +\code{\link{morie_fetch_ckan}}, \code{\link{morie_fetch}} +} diff --git a/r-package/morie/man/morie_cluster_sample.Rd b/r-package/morie/man/morie_cluster_sample.Rd new file mode 100644 index 0000000000..a7626a34e0 --- /dev/null +++ b/r-package/morie/man/morie_cluster_sample.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampling.R +\name{morie_cluster_sample} +\alias{morie_cluster_sample} +\title{Two-stage cluster sampling} +\usage{ +morie_cluster_sample(df, cluster_col, n_clusters, seed = 42L) +} +\arguments{ +\item{df}{A data frame.} + +\item{cluster_col}{Name of the cluster identifier column.} + +\item{n_clusters}{Number of clusters to select.} + +\item{seed}{Random seed.} +} +\value{ +Data frame of selected units with \code{.weight} column. +} +\description{ +Randomly selects \code{n_clusters} clusters, then takes all units within +selected clusters. +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/cnn1d_conv1d_forward.Rd b/r-package/morie/man/morie_cnn1d_conv1d_forward.Rd similarity index 68% rename from r-package/morie/man/cnn1d_conv1d_forward.Rd rename to r-package/morie/man/morie_cnn1d_conv1d_forward.Rd index 222ac7c4a0..556e8601bf 100644 --- a/r-package/morie/man/cnn1d_conv1d_forward.Rd +++ b/r-package/morie/man/morie_cnn1d_conv1d_forward.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cnn1d.R -\name{cnn1d_conv1d_forward} -\alias{cnn1d_conv1d_forward} -\alias{conv1d_forward} +\name{morie_cnn1d_conv1d_forward} +\alias{morie_cnn1d_conv1d_forward} +\alias{morie_conv1d_forward} \title{1D convolution forward pass (cross-correlation, valid padding)} \usage{ -cnn1d_conv1d_forward(x, w, b = 0, stride = 1L, padding = 0L) +morie_cnn1d_conv1d_forward(x, w, b = 0, stride = 1L, padding = 0L) -conv1d_forward(x, w, b = 0, stride = 1L, padding = 0L) +morie_conv1d_forward(x, w, b = 0, stride = 1L, padding = 0L) } \arguments{ \item{x}{Numeric vector.} @@ -30,10 +30,7 @@ R parity for \code{morie.fn.cnn1d.conv1d_forward}. \deqn{y[i] = \sum_{k=0}^{K-1} w[k] x[i\,s + k] + b} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_cnn1d_conv1d_forward(x = rnorm(50), w = rnorm(3)) } \references{ Goodfellow et al. (2016), Deep Learning, Ch 9. diff --git a/r-package/morie/man/cnn2d_conv2d_forward.Rd b/r-package/morie/man/morie_cnn2d_conv2d_forward.Rd similarity index 69% rename from r-package/morie/man/cnn2d_conv2d_forward.Rd rename to r-package/morie/man/morie_cnn2d_conv2d_forward.Rd index ed35924600..a8663d4782 100644 --- a/r-package/morie/man/cnn2d_conv2d_forward.Rd +++ b/r-package/morie/man/morie_cnn2d_conv2d_forward.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cnn2d.R -\name{cnn2d_conv2d_forward} -\alias{cnn2d_conv2d_forward} -\alias{conv2d_forward} +\name{morie_cnn2d_conv2d_forward} +\alias{morie_cnn2d_conv2d_forward} +\alias{morie_conv2d_forward} \title{2D convolution forward pass (cross-correlation, single channel)} \usage{ -cnn2d_conv2d_forward(x, w, b = 0, stride = 1L, padding = 0L) +morie_cnn2d_conv2d_forward(x, w, b = 0, stride = 1L, padding = 0L) -conv2d_forward(x, w, b = 0, stride = 1L, padding = 0L) +morie_conv2d_forward(x, w, b = 0, stride = 1L, padding = 0L) } \arguments{ \item{x}{Numeric matrix \code{(H, W)}.} @@ -30,10 +30,7 @@ R parity for \code{morie.fn.cnn2d.conv2d_forward}. \deqn{y[i,j] = \sum_{m,n} w[m,n] x[i s + m, j s + n] + b} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_cnn2d_conv2d_forward(x = rnorm(50), w = rnorm(3)) } \references{ Goodfellow et al. (2016), Deep Learning, Ch 9. diff --git a/r-package/morie/man/cnn_genomic.Rd b/r-package/morie/man/morie_cnn_genomic.Rd similarity index 85% rename from r-package/morie/man/cnn_genomic.Rd rename to r-package/morie/man/morie_cnn_genomic.Rd index ce46e0953c..9d95e989f3 100644 --- a/r-package/morie/man/cnn_genomic.Rd +++ b/r-package/morie/man/morie_cnn_genomic.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cnnge.R -\name{cnn_genomic} -\alias{cnn_genomic} +\name{morie_cnn_genomic} +\alias{morie_cnn_genomic} \title{CNN genomic predictor (Conv1D + GAP + dense, base R)} \usage{ -cnn_genomic( +morie_cnn_genomic( x, y, markers, @@ -39,10 +39,7 @@ list(estimate, y_hat, W_conv, b_conv, W1, b1, w2, b2, se, n, method). CNN genomic predictor (Conv1D + GAP + dense, base R) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_cnn_genomic(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) } \references{ Montesinos Lopez Ch 13. diff --git a/r-package/morie/man/morie_cohens_d.Rd b/r-package/morie/man/morie_cohens_d.Rd new file mode 100644 index 0000000000..9f65efbaae --- /dev/null +++ b/r-package/morie/man/morie_cohens_d.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_cohens_d} +\alias{morie_cohens_d} +\title{Cohen's d effect size} +\usage{ +morie_cohens_d(x1, x2, pooled = TRUE) +} +\arguments{ +\item{x1}{Numeric vector (group 1).} + +\item{x2}{Numeric vector (group 2).} + +\item{pooled}{Use pooled SD (default \code{TRUE}). If \code{FALSE}, uses \code{sd(x2)}.} +} +\value{ +Numeric Cohen's d. +} +\description{ +Cohen's d effect size +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_coherence.Rd b/r-package/morie/man/morie_coherence.Rd new file mode 100644 index 0000000000..16cd6efc92 --- /dev/null +++ b/r-package/morie/man/morie_coherence.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cohrc.R +\name{morie_coherence} +\alias{morie_coherence} +\title{Magnitude-squared morie_coherence between two time series} +\usage{ +morie_coherence(x, y, nperseg = NULL, fs = 1) +} +\arguments{ +\item{x}{Numeric vector.} + +\item{y}{Numeric vector (same length).} + +\item{nperseg}{Segment length. Default n/4.} + +\item{fs}{Sampling frequency. Default 1.} +} +\value{ +Named list with \code{frequencies, morie_coherence, n_segments, + nperseg, fs, n, method}. +} +\description{ +Magnitude-squared morie_coherence between two time series +} +\examples{ +morie_coherence(x = rnorm(50), y = rnorm(50)) +} diff --git a/r-package/morie/man/compare_nested_logistic_models.Rd b/r-package/morie/man/morie_compare_nested_logistic_models.Rd similarity index 80% rename from r-package/morie/man/compare_nested_logistic_models.Rd rename to r-package/morie/man/morie_compare_nested_logistic_models.Rd index ffce6a1c34..07f96dbce7 100644 --- a/r-package/morie/man/compare_nested_logistic_models.Rd +++ b/r-package/morie/man/morie_compare_nested_logistic_models.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/investigation.R -\name{compare_nested_logistic_models} -\alias{compare_nested_logistic_models} +\name{morie_compare_nested_logistic_models} +\alias{morie_compare_nested_logistic_models} \title{Compare nested logistic-regression models via likelihood-ratio test} \usage{ -compare_nested_logistic_models( +morie_compare_nested_logistic_models( data, outcome, predictors_full, @@ -36,8 +36,9 @@ df <- data.frame( y = rbinom(200, 1, 0.4), x1 = rnorm(200), x2 = rnorm(200), x3 = rnorm(200) ) -compare_nested_logistic_models(df, +morie_compare_nested_logistic_models(df, outcome = "y", - predictors_full = c("x1", "x2", "x3"), - predictors_reduced = c("x1")) + predictors_full = c("x1", "x2", "x3"), + predictors_reduced = c("x1") +) } diff --git a/r-package/morie/man/morie_compute_design_weights.Rd b/r-package/morie/man/morie_compute_design_weights.Rd new file mode 100644 index 0000000000..ba0fc6d64f --- /dev/null +++ b/r-package/morie/man/morie_compute_design_weights.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampling.R +\name{morie_compute_design_weights} +\alias{morie_compute_design_weights} +\title{Compute inverse-probability design weights} +\usage{ +morie_compute_design_weights(df, strata_col, population_sizes) +} +\arguments{ +\item{df}{A data frame.} + +\item{strata_col}{Name of the stratification column.} + +\item{population_sizes}{Named integer vector: stratum level -> population size.} +} +\value{ +Numeric vector of design weights (same length as \code{nrow(df)}). +} +\description{ +Compute inverse-probability design weights +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/concordance_incomplete.Rd b/r-package/morie/man/morie_concordance_incomplete.Rd similarity index 76% rename from r-package/morie/man/concordance_incomplete.Rd rename to r-package/morie/man/morie_concordance_incomplete.Rd index 415c1e0229..5ca6d01107 100644 --- a/r-package/morie/man/concordance_incomplete.Rd +++ b/r-package/morie/man/morie_concordance_incomplete.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cncrd.R -\name{concordance_incomplete} -\alias{concordance_incomplete} +\name{morie_concordance_incomplete} +\alias{morie_concordance_incomplete} \title{Kendall's coefficient of concordance W (Gibbons Ch 12.5)} \usage{ -concordance_incomplete(x) +morie_concordance_incomplete(x) } \arguments{ \item{x}{Matrix (n objects rows x k rankers cols); NA = not ranked.} @@ -19,8 +19,5 @@ squared deviations of object rank-sums from their mean. Significance via chi-square approximation k(n-1) W ~ chi-square with n-1 df. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_concordance_incomplete(x = rnorm(50)) } diff --git a/r-package/morie/man/confusion_matrix_metrics.Rd b/r-package/morie/man/morie_confusion_matrix_metrics.Rd similarity index 72% rename from r-package/morie/man/confusion_matrix_metrics.Rd rename to r-package/morie/man/morie_confusion_matrix_metrics.Rd index 3d91e36c16..6f5a687547 100644 --- a/r-package/morie/man/confusion_matrix_metrics.Rd +++ b/r-package/morie/man/morie_confusion_matrix_metrics.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/confm.R -\name{confusion_matrix_metrics} -\alias{confusion_matrix_metrics} +\name{morie_confusion_matrix_metrics} +\alias{morie_confusion_matrix_metrics} \title{Confusion matrix with precision / recall / F1 (R parity)} \usage{ -confusion_matrix_metrics(y_true, y_pred, labels = NULL) +morie_confusion_matrix_metrics(y_true, y_pred, labels = NULL) } \arguments{ \item{y_true}{Observed labels.} @@ -23,8 +23,5 @@ Manually constructs the confusion matrix to avoid the caret dependency for what is fundamentally a tabulation. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_confusion_matrix_metrics(y_true = rbinom(50, 1, 0.5), y_pred = rbinom(50, 1, 0.5)) } diff --git a/r-package/morie/man/contingency_coefficient.Rd b/r-package/morie/man/morie_contingency_coefficient.Rd similarity index 61% rename from r-package/morie/man/contingency_coefficient.Rd rename to r-package/morie/man/morie_contingency_coefficient.Rd index 57ab3807f6..a25fab0695 100644 --- a/r-package/morie/man/contingency_coefficient.Rd +++ b/r-package/morie/man/morie_contingency_coefficient.Rd @@ -1,16 +1,16 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cntgc.R -\name{contingency_coefficient} -\alias{contingency_coefficient} +\name{morie_contingency_coefficient} +\alias{morie_contingency_coefficient} \title{Pearson contingency coefficient C (Gibbons Ch 14.2.1)} \usage{ -contingency_coefficient(x) +morie_contingency_coefficient(x) } \arguments{ \item{x}{A 2-D contingency table of counts.} } \value{ -Named list: statistic (C), cramers_v, chi2, p_value, df, +Named list: statistic (C), morie_cramers_v, chi2, p_value, df, max_C, n. } \description{ @@ -18,8 +18,5 @@ C = sqrt(chi^2 / (chi^2 + n)). Also reports Cramer's V and the maximum attainable C = sqrt((min(r,c)-1)/min(r,c)). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_contingency_coefficient(x = matrix(sample(1:5, 50, TRUE), 10, 5)) } diff --git a/r-package/morie/man/control_comparison.Rd b/r-package/morie/man/morie_control_comparison.Rd similarity index 80% rename from r-package/morie/man/control_comparison.Rd rename to r-package/morie/man/morie_control_comparison.Rd index d1f6c17cae..d9abc66338 100644 --- a/r-package/morie/man/control_comparison.Rd +++ b/r-package/morie/man/morie_control_comparison.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ctrlc.R -\name{control_comparison} -\alias{control_comparison} +\name{morie_control_comparison} +\alias{morie_control_comparison} \title{Nonparametric many-to-one comparisons to a control (Gibbons Ch 10.7)} \usage{ -control_comparison( +morie_control_comparison( groups, control_index = 1L, adjust = c("bonferroni", "none") @@ -27,8 +27,5 @@ Mann-Whitney vs. control for each treatment group; Bonferroni- adjusted p-values by default. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_control_comparison(groups = list(rnorm(20), rnorm(20), rnorm(20))) } diff --git a/r-package/morie/man/control_median_test.Rd b/r-package/morie/man/morie_control_median_test.Rd similarity index 71% rename from r-package/morie/man/control_median_test.Rd rename to r-package/morie/man/morie_control_median_test.Rd index 7fe81ba60a..ddafbfef73 100644 --- a/r-package/morie/man/control_median_test.Rd +++ b/r-package/morie/man/morie_control_median_test.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ctmed.R -\name{control_median_test} -\alias{control_median_test} +\name{morie_control_median_test} +\alias{morie_control_median_test} \title{Mood's median (control-median) test (Gibbons Ch 6.5)} \usage{ -control_median_test(x, y) +morie_control_median_test(x, y) } \arguments{ \item{x}{Numeric vector (control).} @@ -19,8 +19,5 @@ Two-sample median test: contingency-table chi-square on the counts above/below the pooled-sample median. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_control_median_test(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/morie_cpads_contract.Rd b/r-package/morie/man/morie_cpads_contract.Rd new file mode 100644 index 0000000000..e06d665cee --- /dev/null +++ b/r-package/morie/man/morie_cpads_contract.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ipw.R +\name{morie_cpads_contract} +\alias{morie_cpads_contract} +\title{Return the canonical CPADS local-data contract} +\usage{ +morie_cpads_contract() +} +\value{ +Named list describing the expected local CPADS contract. +} +\description{ +Return the canonical CPADS local-data contract +} +\examples{ +morie_cpads_contract() +} diff --git a/r-package/morie/man/morie_cramers_v.Rd b/r-package/morie/man/morie_cramers_v.Rd new file mode 100644 index 0000000000..7d2bdaafe0 --- /dev/null +++ b/r-package/morie/man/morie_cramers_v.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_cramers_v} +\alias{morie_cramers_v} +\title{Cramer's V for categorical association} +\usage{ +morie_cramers_v(contingency_table) +} +\arguments{ +\item{contingency_table}{A numeric matrix of observed counts.} +} +\value{ +Numeric Cramer's V in the interval [0, 1]. +} +\description{ +Cramer's V for categorical association +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_dataset_catalog.Rd b/r-package/morie/man/morie_dataset_catalog.Rd index 541f47cd29..7aa2b35b7d 100644 --- a/r-package/morie/man/morie_dataset_catalog.Rd +++ b/r-package/morie/man/morie_dataset_catalog.Rd @@ -7,9 +7,11 @@ morie_dataset_catalog() } \value{ -A data.frame with 36 rows (one per dataset) and columns: +A data.frame with 44 rows (one per dataset) and columns: key, name, source, survey, year, format, type, large_file, -local_path, table_name, ckan_resource_id. +local_path, table_name, ckan_resource_id, download_url, zip_member. +The \code{download_url} / \code{zip_member} columns are empty for +datasets reachable through the SQLite cache or the CKAN datastore. } \description{ Returns a data.frame describing every dataset available through the @@ -26,6 +28,8 @@ cat <- morie_dataset_catalog() nrow(cat) head(cat[, c("key", "name", "source", "year")]) # Find Ontario carceral datasets: -cat[grepl("OTIS|Ontario", paste(cat$source, cat$survey)), - c("key", "year")] +cat[ + grepl("OTIS|Ontario", paste(cat$source, cat$survey)), + c("key", "year") +] } diff --git a/r-package/morie/man/morie_dataset_info.Rd b/r-package/morie/man/morie_dataset_info.Rd index 88ed9f657b..69b2160e1d 100644 --- a/r-package/morie/man/morie_dataset_info.Rd +++ b/r-package/morie/man/morie_dataset_info.Rd @@ -18,7 +18,8 @@ Get metadata for a single dataset \examples{ # Use a real catalog key (run `morie_dataset_catalog()$key` to list them): info <- morie_dataset_info("ocp21") -info$source; info$year +info$source +info$year # Fuzzy match works for partial / forgiving keys: morie_dataset_info("cpads")$key } diff --git a/r-package/morie/man/morie_db_connect.Rd b/r-package/morie/man/morie_db_connect.Rd index d80ebce4b3..814eba00f8 100644 --- a/r-package/morie/man/morie_db_connect.Rd +++ b/r-package/morie/man/morie_db_connect.Rd @@ -2,33 +2,48 @@ % Please edit documentation in R/database.R \name{morie_db_connect} \alias{morie_db_connect} -\title{Connect to the MORIE SQLite cache database} +\title{Connect to the MORIE cache database} \usage{ morie_db_connect(db_path = NULL) } \arguments{ -\item{db_path}{Path to the SQLite file. Defaults to the -\code{MORIE_CACHE_DB} env var, else \code{morie.db} in the -per-user cache directory.} +\item{db_path}{Optional path to a DuckDB (\code{*.duckdb}) or SQLite +(\code{*.db}) file. Defaults to the \code{MORIE_CACHE_DB} env var, +else \code{morie.duckdb} / \code{morie.db} in the per-user cache +directory.} } \value{ A DBI connection object. } \description{ -Opens (or creates) the shared cache at \code{morie.db} in the -per-user cache directory (\code{~/.cache/morie} or -\code{XDG_CACHE_HOME}). Both R (DBI/RSQLite) and Python (sqlite3) -read/write this same file. +Opens (or creates) the per-user cache database. The default backend +is \strong{DuckDB} — zero-config like SQLite, but vectorised + columnar, +so it handles the multi-GB-scale open-data PUMFs (TPS, CPADS bulk) +that morie ingests without breaking down on analytical queries. For +back-compat, an existing SQLite cache at \code{morie.db} is reused; if +duckdb is unavailable, falls back to SQLite. +} +\details{ +For non-default backends (PostgreSQL, MariaDB, MS SQL Server, ...), +construct your own DBI connection and pass it as \code{con} to the +\verb{morie_cache_*} and \code{morie_load_dataset} functions: + +\preformatted{ +con <- DBI::dbConnect(RPostgres::Postgres(), + host = "...", dbname = "morie", user = "...", password = "...") +morie_load_dataset("ocp21", con = con) +} } \examples{ \donttest{ - if (requireNamespace("DBI", quietly = TRUE) && - requireNamespace("RSQLite", quietly = TRUE)) { - tmp <- tempfile(fileext = ".db") - con <- morie_db_connect(db_path = tmp) - DBI::dbListTables(con) - DBI::dbDisconnect(con) - file.remove(tmp) - } +# DuckDB (default when 'duckdb' is installed); pass a '.db' path for SQLite. +if (requireNamespace("duckdb", quietly = TRUE) && + requireNamespace("DBI", quietly = TRUE)) { + tmp <- tempfile(fileext = ".duckdb") + con <- morie_db_connect(db_path = tmp) + DBI::dbListTables(con) + DBI::dbDisconnect(con) + file.remove(tmp) +} } } diff --git a/r-package/morie/man/dbscan_clustering.Rd b/r-package/morie/man/morie_dbscan_clustering.Rd similarity index 70% rename from r-package/morie/man/dbscan_clustering.Rd rename to r-package/morie/man/morie_dbscan_clustering.Rd index dd11a32588..21c14df8d7 100644 --- a/r-package/morie/man/dbscan_clustering.Rd +++ b/r-package/morie/man/morie_dbscan_clustering.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dbscl.R -\name{dbscan_clustering} -\alias{dbscan_clustering} +\name{morie_dbscan_clustering} +\alias{morie_dbscan_clustering} \title{DBSCAN density-based clustering (R parity)} \usage{ -dbscan_clustering(x, eps = 0.5, min_samples = 5L, metric = "euclidean") +morie_dbscan_clustering(x, eps = 0.5, min_samples = 5L, metric = "euclidean") } \arguments{ \item{x}{Numeric matrix.} @@ -23,8 +23,5 @@ core_sample_indices, eps, min_samples, n, method. Wraps \code{dbscan::dbscan}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_dbscan_clustering(x = rnorm(50)) } diff --git a/r-package/morie/man/dcc_multivariate_garch.Rd b/r-package/morie/man/morie_dcc_multivariate_garch.Rd similarity index 69% rename from r-package/morie/man/dcc_multivariate_garch.Rd rename to r-package/morie/man/morie_dcc_multivariate_garch.Rd index 5df4973c82..50d9128114 100644 --- a/r-package/morie/man/dcc_multivariate_garch.Rd +++ b/r-package/morie/man/morie_dcc_multivariate_garch.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dccmd.R -\name{dcc_multivariate_garch} -\alias{dcc_multivariate_garch} +\name{morie_dcc_multivariate_garch} +\alias{morie_dcc_multivariate_garch} \title{DCC multivariate GARCH (Engle 2002)} \usage{ -dcc_multivariate_garch(x) +morie_dcc_multivariate_garch(x) } \arguments{ \item{x}{Numeric matrix of returns (T x k).} @@ -17,8 +17,5 @@ Named list with \code{a, b, unconditional_correlation, Two-step DCC(1,1) on a panel of return series. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_dcc_multivariate_garch(x = matrix(rnorm(150), 50, 3)) } diff --git a/r-package/morie/man/decision_tree_split.Rd b/r-package/morie/man/morie_decision_tree_split.Rd similarity index 75% rename from r-package/morie/man/decision_tree_split.Rd rename to r-package/morie/man/morie_decision_tree_split.Rd index a51a38865e..4c80747498 100644 --- a/r-package/morie/man/decision_tree_split.Rd +++ b/r-package/morie/man/morie_decision_tree_split.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dtrsp.R -\name{decision_tree_split} -\alias{decision_tree_split} +\name{morie_decision_tree_split} +\alias{morie_decision_tree_split} \title{Decision tree split (R parity)} \usage{ -decision_tree_split(x, y, criterion = "gini", max_depth = 30L, seed = 0L) +morie_decision_tree_split(x, y, criterion = "gini", max_depth = 30L, seed = 0L) } \arguments{ \item{x}{Numeric predictor matrix.} @@ -27,8 +27,5 @@ CART tree via \code{rpart::rpart}, returning the root split structure and feature importances. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_decision_tree_split(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/deep_learning_genomic.Rd b/r-package/morie/man/morie_deep_learning_genomic.Rd similarity index 82% rename from r-package/morie/man/deep_learning_genomic.Rd rename to r-package/morie/man/morie_deep_learning_genomic.Rd index 62446fbf8c..4fe2043577 100644 --- a/r-package/morie/man/deep_learning_genomic.Rd +++ b/r-package/morie/man/morie_deep_learning_genomic.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dlgen.R -\name{deep_learning_genomic} -\alias{deep_learning_genomic} +\name{morie_deep_learning_genomic} +\alias{morie_deep_learning_genomic} \title{Single-hidden-layer MLP genomic predictor (base R)} \usage{ -deep_learning_genomic( +morie_deep_learning_genomic( x, y, markers, @@ -45,10 +45,10 @@ list(estimate, y_hat, beta, W1, b1, w2, b2, se, n, method). Single-hidden-layer MLP genomic predictor (base R) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_deep_learning_genomic( + x = rnorm(50), y = rnorm(50), + markers = matrix(sample(0:2, 200, TRUE), 50, 4) +) } \references{ Montesinos Lopez Ch 12. diff --git a/r-package/morie/man/morie_default_synthetic_name_map.Rd b/r-package/morie/man/morie_default_synthetic_name_map.Rd new file mode 100644 index 0000000000..64e6293d24 --- /dev/null +++ b/r-package/morie/man/morie_default_synthetic_name_map.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/synthetic.R +\name{morie_default_synthetic_name_map} +\alias{morie_default_synthetic_name_map} +\title{Default synthetic-data variable name map} +\usage{ +morie_default_synthetic_name_map(profile = c("generic", "morie_legacy")) +} +\arguments{ +\item{profile}{Name profile. \code{"generic"} is recommended for new projects. +\code{"morie_legacy"} reproduces previous EML legacy column names.} +} +\value{ +Named character vector. +} +\description{ +Returns a named character vector mapping canonical variable keys used by +\code{\link[=morie_generate_synthetic_data]{morie_generate_synthetic_data()}} to output column names. +} +\examples{ +morie_default_synthetic_name_map("generic") +} diff --git a/r-package/morie/man/morie_default_workflow_map.Rd b/r-package/morie/man/morie_default_workflow_map.Rd new file mode 100644 index 0000000000..2604646328 --- /dev/null +++ b/r-package/morie/man/morie_default_workflow_map.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/workflow.R +\name{morie_default_workflow_map} +\alias{morie_default_workflow_map} +\title{Default workflow step map} +\usage{ +morie_default_workflow_map() +} +\value{ +Named character vector. +} +\description{ +Returns the default named map of workflow steps to project script paths. +} +\examples{ +morie_default_workflow_map() +} diff --git a/r-package/morie/man/morie_design_effect.Rd b/r-package/morie/man/morie_design_effect.Rd new file mode 100644 index 0000000000..817e23eeb4 --- /dev/null +++ b/r-package/morie/man/morie_design_effect.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampling.R +\name{morie_design_effect} +\alias{morie_design_effect} +\title{Design effect (DEFF)} +\usage{ +morie_design_effect(weights) +} +\arguments{ +\item{weights}{Numeric vector of sampling weights.} +} +\value{ +Numeric design effect (= n / ESS). +} +\description{ +Design effect (DEFF) +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_det_rng.Rd b/r-package/morie/man/morie_det_rng.Rd index 916977a631..5a7627905c 100644 --- a/r-package/morie/man/morie_det_rng.Rd +++ b/r-package/morie/man/morie_det_rng.Rd @@ -38,8 +38,6 @@ loaded only when neither is available. In practice CRAN reverse dependencies of \code{morie} ship with at least one of the two. } \examples{ -\dontrun{ morie_det_rng("ksr07_bootstrap", 42L) -rnorm(5) # reproducible draws keyed by ("ksr07_bootstrap", 42) -} +rnorm(5) # reproducible draws keyed by ("ksr07_bootstrap", 42) } diff --git a/r-package/morie/man/morie_det_rng_sha_hex.Rd b/r-package/morie/man/morie_det_rng_sha_hex.Rd index 57744fdd77..16f594b642 100644 --- a/r-package/morie/man/morie_det_rng_sha_hex.Rd +++ b/r-package/morie/man/morie_det_rng_sha_hex.Rd @@ -20,8 +20,5 @@ identical hex digests for the same \verb{(name, seed)} pair before either RNG is even consulted. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_det_rng_sha_hex(name = "example", seed = 1L) } diff --git a/r-package/morie/man/diffu_diffusion_forward.Rd b/r-package/morie/man/morie_diffu_diffusion_forward.Rd similarity index 82% rename from r-package/morie/man/diffu_diffusion_forward.Rd rename to r-package/morie/man/morie_diffu_diffusion_forward.Rd index 4d40c74ad5..e24999b232 100644 --- a/r-package/morie/man/diffu_diffusion_forward.Rd +++ b/r-package/morie/man/morie_diffu_diffusion_forward.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/diffu.R -\name{diffu_diffusion_forward} -\alias{diffu_diffusion_forward} +\name{morie_diffu_diffusion_forward} +\alias{morie_diffu_diffusion_forward} \title{DDPM forward (noising) process} \usage{ -diffu_diffusion_forward( +morie_diffu_diffusion_forward( x0, t, betas = NULL, @@ -39,10 +39,8 @@ R parity for \code{morie.fn.diffu.diffusion_forward}. with linear \eqn{\beta} schedule from \code{1e-4} to \code{0.02}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Ho, Jain & Abbeel (2020), NeurIPS. diff --git a/r-package/morie/man/diffu_heat_diffusion.Rd b/r-package/morie/man/morie_diffu_heat_diffusion.Rd similarity index 71% rename from r-package/morie/man/diffu_heat_diffusion.Rd rename to r-package/morie/man/morie_diffu_heat_diffusion.Rd index fc44d20f78..4e0a60abeb 100644 --- a/r-package/morie/man/diffu_heat_diffusion.Rd +++ b/r-package/morie/man/morie_diffu_heat_diffusion.Rd @@ -1,13 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/diffu.R -\name{diffu_heat_diffusion} -\alias{diffu_heat_diffusion} -\alias{diffusion_forward} +\name{morie_diffu_heat_diffusion} +\alias{morie_diffu_heat_diffusion} +\alias{morie_diffusion_forward} \title{1D heat diffusion solver (explicit finite differences)} \usage{ -diffu_heat_diffusion(T0, alpha = 0.01, dx = 0.1, dt = 0.01, n_steps = 100L) +morie_diffu_heat_diffusion( + T0, + alpha = 0.01, + dx = 0.1, + dt = 0.01, + n_steps = 100L +) -diffusion_forward(T0, alpha = 0.01, dx = 0.1, dt = 0.01, n_steps = 100L) +morie_diffusion_forward(T0, alpha = 0.01, dx = 0.1, dt = 0.01, n_steps = 100L) } \arguments{ \item{T0}{Initial temperature profile (numeric vector, length >= 3).} @@ -30,10 +36,7 @@ update of \eqn{\partial_t T = \alpha \partial_x^2 T} with Dirichlet (fixed-endpoint) boundary conditions. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_diffu_heat_diffusion(T0 = rep(0, 10)) } \references{ Crank (1975), Mathematics of Diffusion. diff --git a/r-package/morie/man/morie_download_bootstrap.Rd b/r-package/morie/man/morie_download_bootstrap.Rd index 28ddece417..210f923bf9 100644 --- a/r-package/morie/man/morie_download_bootstrap.Rd +++ b/r-package/morie/man/morie_download_bootstrap.Rd @@ -4,7 +4,12 @@ \alias{morie_download_bootstrap} \title{Download bootstrap weight files from CKAN API} \usage{ -morie_download_bootstrap(survey = "all", limit = 32000L, db_path = NULL) +morie_download_bootstrap( + survey = "all", + limit = 32000L, + db_path = NULL, + con = NULL +) } \arguments{ \item{survey}{One of \code{"csads_2021"}, \code{"csads_2023"}, @@ -12,7 +17,9 @@ morie_download_bootstrap(survey = "all", limit = 32000L, db_path = NULL) \item{limit}{Max records per CKAN request (default 32000).} -\item{db_path}{Optional override for cache database path.} +\item{db_path}{Optional path to a SQLite/DuckDB file (default backend).} + +\item{con}{Optional pre-opened DBI connection (overrides \code{db_path}).} } \value{ Invisibly, the number of CSV files successfully downloaded. @@ -22,8 +29,8 @@ Downloads large bootstrap weight CSVs that are too big to ship with the package. Data is cached in the user cache database for future use. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") +\donttest{ +# See the package vignettes for usage examples: +# vignette(package = "morie") } } diff --git a/r-package/morie/man/drpfw_dropout_forward.Rd b/r-package/morie/man/morie_drpfw_dropout_forward.Rd similarity index 81% rename from r-package/morie/man/drpfw_dropout_forward.Rd rename to r-package/morie/man/morie_drpfw_dropout_forward.Rd index c7e146638a..9cf277b9bc 100644 --- a/r-package/morie/man/drpfw_dropout_forward.Rd +++ b/r-package/morie/man/morie_drpfw_dropout_forward.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/drpfw.R -\name{drpfw_dropout_forward} -\alias{drpfw_dropout_forward} -\alias{dropout_forward} +\name{morie_drpfw_dropout_forward} +\alias{morie_drpfw_dropout_forward} +\alias{morie_dropout_forward} \title{Dropout forward pass (inverted)} \usage{ -drpfw_dropout_forward( +morie_drpfw_dropout_forward( x, p = 0.5, seed = 0L, @@ -13,7 +13,7 @@ drpfw_dropout_forward( deterministic_seed = NULL ) -dropout_forward( +morie_dropout_forward( x, p = 0.5, seed = 0L, @@ -45,10 +45,7 @@ R parity for \code{morie.fn.drpfw.dropout_forward}. \deqn{y = x \odot m / (1-p), \quad m \sim \mathrm{Bernoulli}(1-p)} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_drpfw_dropout_forward(x = rnorm(50)) } \references{ Srivastava et al. (2014), JMLR 15:1929-1958. diff --git a/r-package/morie/man/morie_e_value.Rd b/r-package/morie/man/morie_e_value.Rd new file mode 100644 index 0000000000..aba4ff3faf --- /dev/null +++ b/r-package/morie/man/morie_e_value.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_e_value} +\alias{morie_e_value} +\title{Compute E-value for unmeasured confounding} +\usage{ +morie_e_value(rr, rr_lower = NULL) +} +\arguments{ +\item{rr}{Risk ratio estimate (> 0). Supply > 1; if < 1, pass its reciprocal.} + +\item{rr_lower}{Lower bound of the 95\% CI (used to compute E-value for CI).} +} +\value{ +Named list: \code{morie_e_value}, \code{e_value_ci} (for the CI bound). +} +\description{ +The E-value quantifies the minimum strength of confounding association +needed to fully explain away an observed treatment effect: +\deqn{E = RR + \sqrt{RR \cdot (RR - 1)}} +} +\details{ +For a risk ratio \eqn{RR < 1}, use \eqn{1/RR} before applying the formula. +} +\examples{ +morie_e_value(rr = 3.9, rr_lower = 2.4) +} +\references{ +VanderWeele TJ, Ding P (2017). Sensitivity analysis in observational +research: introducing the E-value. \emph{Annals of Internal Medicine}, +167(4):268-274. +} diff --git a/r-package/morie/man/morie_effective_sample_size.Rd b/r-package/morie/man/morie_effective_sample_size.Rd new file mode 100644 index 0000000000..7192b6c3ca --- /dev/null +++ b/r-package/morie/man/morie_effective_sample_size.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampling.R +\name{morie_effective_sample_size} +\alias{morie_effective_sample_size} +\title{Kish effective sample size} +\usage{ +morie_effective_sample_size(weights) +} +\arguments{ +\item{weights}{Numeric vector of sampling weights.} +} +\value{ +Numeric ESS. +} +\description{ +Kish effective sample size +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/eg_coint.Rd b/r-package/morie/man/morie_eg_coint.Rd similarity index 73% rename from r-package/morie/man/eg_coint.Rd rename to r-package/morie/man/morie_eg_coint.Rd index 59e4e3f13e..f449f246a2 100644 --- a/r-package/morie/man/eg_coint.Rd +++ b/r-package/morie/man/morie_eg_coint.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/coitg.R -\name{eg_coint} -\alias{eg_coint} +\name{morie_eg_coint} +\alias{morie_eg_coint} \title{Engle-Granger two-step cointegration test} \usage{ -eg_coint(y1, y2, max_lag = NULL) +morie_eg_coint(y1, y2, max_lag = NULL) } \arguments{ \item{y1}{Numeric, first series.} @@ -20,8 +20,5 @@ Named list with \code{adf_statistic, p_value, beta, n, method}. Engle-Granger two-step cointegration test } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_eg_coint(y1 = rnorm(100), y2 = rnorm(100)) } diff --git a/r-package/morie/man/egarch_model.Rd b/r-package/morie/man/morie_egarch_model.Rd similarity index 70% rename from r-package/morie/man/egarch_model.Rd rename to r-package/morie/man/morie_egarch_model.Rd index 4481146a79..49d836d4b2 100644 --- a/r-package/morie/man/egarch_model.Rd +++ b/r-package/morie/man/morie_egarch_model.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/egrch.R -\name{egarch_model} -\alias{egarch_model} +\name{morie_egarch_model} +\alias{morie_egarch_model} \title{EGARCH(1,1) asymmetric volatility model} \usage{ -egarch_model(x) +morie_egarch_model(x) } \arguments{ \item{x}{Numeric return series.} @@ -17,8 +17,5 @@ Named list with \code{omega, alpha, gamma, beta, loglik, EGARCH(1,1) asymmetric volatility model } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_egarch_model(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_estimate_aipw.Rd b/r-package/morie/man/morie_estimate_aipw.Rd new file mode 100644 index 0000000000..e9449aac3e --- /dev/null +++ b/r-package/morie/man/morie_estimate_aipw.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_estimate_aipw} +\alias{morie_estimate_aipw} +\title{Augmented IPW (AIPW) doubly-robust ATE estimator} +\usage{ +morie_estimate_aipw( + data, + treatment, + outcome, + covariates, + propensity_col = NULL, + outcome_model = c("linear", "logistic") +) +} +\arguments{ +\item{data}{A data frame.} + +\item{treatment}{Name of the binary treatment column.} + +\item{outcome}{Name of the outcome column.} + +\item{covariates}{Character vector of covariate names.} + +\item{propensity_col}{Optional: name of a pre-computed propensity score column.} + +\item{outcome_model}{Family for the outcome model: \code{"linear"} or \code{"logistic"}.} +} +\value{ +Named list: \code{ate}, \code{se}, \code{ci_lower}, \code{ci_upper}, \code{n}. +} +\description{ +Combines IPW and outcome regression corrections. Consistent if +\strong{either} the propensity model \strong{or} the outcome model is correctly +specified. +} +\examples{ +set.seed(1) +df <- data.frame(t = rbinom(200, 1, 0.4), y = rnorm(200), x = rnorm(200)) +morie_estimate_aipw(df, "t", "y", "x") +} diff --git a/r-package/morie/man/morie_estimate_atc.Rd b/r-package/morie/man/morie_estimate_atc.Rd new file mode 100644 index 0000000000..77b9c3ec60 --- /dev/null +++ b/r-package/morie/man/morie_estimate_atc.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_estimate_atc} +\alias{morie_estimate_atc} +\title{Estimate the Average Treatment Effect on the Controls (ATC)} +\usage{ +morie_estimate_atc(data, treatment, outcome, covariates, propensity_col = NULL) +} +\arguments{ +\item{data}{A data frame.} + +\item{treatment}{Name of the binary treatment column.} + +\item{outcome}{Name of the outcome column.} + +\item{covariates}{Character vector of covariate names.} + +\item{propensity_col}{Optional: name of a pre-computed propensity score column.} +} +\value{ +Named list: \code{atc}, \code{se}, \code{ci_lower}, \code{ci_upper}, \code{n_control}. +} +\description{ +Control units receive weight 1; treated units receive +\eqn{w_i = (1-\hat{e}(X_i))/\hat{e}(X_i)}. +} +\examples{ +set.seed(1) +df <- data.frame(t = rbinom(200, 1, 0.4), y = rnorm(200), x = rnorm(200)) +morie_estimate_atc(df, "t", "y", "x") +} diff --git a/r-package/morie/man/morie_estimate_ate.Rd b/r-package/morie/man/morie_estimate_ate.Rd new file mode 100644 index 0000000000..b1869c6e45 --- /dev/null +++ b/r-package/morie/man/morie_estimate_ate.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_estimate_ate} +\alias{morie_estimate_ate} +\title{Estimate the Average Treatment Effect (ATE) via Hajek IPW} +\usage{ +morie_estimate_ate(data, treatment, outcome, covariates, propensity_col = NULL) +} +\arguments{ +\item{data}{A data frame.} + +\item{treatment}{Name of the binary treatment column.} + +\item{outcome}{Name of the outcome column.} + +\item{covariates}{Character vector of covariate names.} + +\item{propensity_col}{Optional: name of a pre-computed propensity score column.} +} +\value{ +Named list: \code{ate}, \code{se}, \code{ci_lower}, \code{ci_upper}, \code{n}, \code{ess}. +} +\description{ +The Hajek estimator uses stabilised IPW weights: +\deqn{\widehat{ATE} = \bar{y}_1^{w} - \bar{y}_0^{w}} +where \eqn{\bar{y}_t^{w} = \sum_{T_i=t} w_i Y_i / \sum_{T_i=t} w_i} +and \eqn{w_i = T_i/\hat{e}(X_i) + (1-T_i)/(1-\hat{e}(X_i))}. +} +\examples{ +set.seed(1) +df <- data.frame( + t = rbinom(200, 1, 0.4), + y = rnorm(200), + x = rnorm(200) +) +morie_estimate_ate(df, "t", "y", "x") +} diff --git a/r-package/morie/man/morie_estimate_att.Rd b/r-package/morie/man/morie_estimate_att.Rd new file mode 100644 index 0000000000..117db81ec2 --- /dev/null +++ b/r-package/morie/man/morie_estimate_att.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_estimate_att} +\alias{morie_estimate_att} +\title{Estimate the Average Treatment Effect on the Treated (ATT)} +\usage{ +morie_estimate_att(data, treatment, outcome, covariates, propensity_col = NULL) +} +\arguments{ +\item{data}{A data frame.} + +\item{treatment}{Name of the binary treatment column.} + +\item{outcome}{Name of the outcome column.} + +\item{covariates}{Character vector of covariate names.} + +\item{propensity_col}{Optional: name of a pre-computed propensity score column.} +} +\value{ +Named list: \code{att}, \code{se}, \code{ci_lower}, \code{ci_upper}, \code{n_treated}. +} +\description{ +Treated units receive weight 1; controls receive +\eqn{w_i = \hat{e}(X_i)/(1-\hat{e}(X_i))}. +} +\examples{ +set.seed(2) +df <- data.frame(t = rbinom(200, 1, 0.4), y = rnorm(200), x = rnorm(200)) +morie_estimate_att(df, "t", "y", "x") +} diff --git a/r-package/morie/man/morie_estimate_cate.Rd b/r-package/morie/man/morie_estimate_cate.Rd new file mode 100644 index 0000000000..b9672d8975 --- /dev/null +++ b/r-package/morie/man/morie_estimate_cate.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_estimate_cate} +\alias{morie_estimate_cate} +\title{Estimate per-unit Conditional Average Treatment Effects (CATE)} +\usage{ +morie_estimate_cate( + data, + treatment, + outcome, + covariates, + propensity_col = NULL, + outcome_model = c("linear", "logistic"), + meta_learner = c("t_learner", "s_learner") +) +} +\arguments{ +\item{data}{A data frame.} + +\item{treatment}{Name of the binary treatment column.} + +\item{outcome}{Name of the outcome column.} + +\item{covariates}{Character vector of covariate names.} + +\item{propensity_col}{Optional: name of a pre-computed propensity score column.} + +\item{outcome_model}{Family for the outcome model: \code{"linear"} or \code{"logistic"}.} + +\item{meta_learner}{\code{"t_learner"} (default) or \code{"s_learner"}.} +} +\value{ +Numeric vector of per-unit CATE estimates. +} +\description{ +The \strong{T-learner} fits separate outcome models on treated and control +units, then predicts the counterfactual for each unit: +\eqn{\widehat{CATE}_i = \hat{\mu}_1(X_i) - \hat{\mu}_0(X_i)}. +} +\details{ +The \strong{S-learner} fits one model with treatment as a feature. +} +\examples{ +morie_estimate_cate( + data = data.frame( + t = stats::rbinom(100, 1, 0.4), + y = stats::rbinom(100, 1, 0.3), x1 = stats::rnorm(100), + x2 = stats::rnorm(100) + ), treatment = "t", outcome = "y", + covariates = c("x1", "x2") +) +} diff --git a/r-package/morie/man/morie_estimate_g_computation.Rd b/r-package/morie/man/morie_estimate_g_computation.Rd new file mode 100644 index 0000000000..321333bed7 --- /dev/null +++ b/r-package/morie/man/morie_estimate_g_computation.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_estimate_g_computation} +\alias{morie_estimate_g_computation} +\title{G-computation (outcome regression) ATE estimator} +\usage{ +morie_estimate_g_computation( + data, + treatment, + outcome, + covariates, + outcome_model = c("linear", "logistic") +) +} +\arguments{ +\item{data}{A data frame.} + +\item{treatment}{Name of the binary treatment column.} + +\item{outcome}{Name of the outcome column.} + +\item{covariates}{Character vector of covariate names.} + +\item{outcome_model}{Family for the outcome model: \code{"linear"} or \code{"logistic"}.} +} +\value{ +Named list: \code{ate}, \code{se}, \code{ci_lower}, \code{ci_upper}. +} +\description{ +Estimates the ATE by: +\deqn{\widehat{ATE} = \frac{1}{n}\sum_i \bigl[\hat{\mu}_1(X_i) - \hat{\mu}_0(X_i)\bigr]} +} +\examples{ +set.seed(1) +df <- data.frame(t = rbinom(200, 1, 0.4), y = rnorm(200), x = rnorm(200)) +morie_estimate_g_computation(df, "t", "y", "x") +} diff --git a/r-package/morie/man/morie_estimate_gate.Rd b/r-package/morie/man/morie_estimate_gate.Rd new file mode 100644 index 0000000000..ac0b1d402d --- /dev/null +++ b/r-package/morie/man/morie_estimate_gate.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_estimate_gate} +\alias{morie_estimate_gate} +\title{Estimate Group Average Treatment Effects (GATE)} +\usage{ +morie_estimate_gate( + data, + treatment, + outcome, + covariates, + group_col, + propensity_col = NULL, + outcome_model = c("linear", "logistic") +) +} +\arguments{ +\item{data}{A data frame.} + +\item{treatment}{Name of the binary treatment column.} + +\item{outcome}{Name of the outcome column.} + +\item{covariates}{Character vector of covariate names.} + +\item{group_col}{Name of the grouping variable (e.g. \code{"gender"}).} + +\item{propensity_col}{Optional: name of a pre-computed propensity score column.} + +\item{outcome_model}{Family for the outcome model: \code{"linear"} or \code{"logistic"}.} +} +\value{ +Data frame with columns: \code{group}, \code{ate}, \code{se}, +\code{ci_lower}, \code{ci_upper}, \code{n}. +} +\description{ +Applies AIPW within each level of \code{group_col} to estimate +stratum-specific treatment effects. +} +\examples{ +set.seed(3) +df <- data.frame( + t = rbinom(300, 1, 0.4), + y = rnorm(300), + x = rnorm(300), + g = sample(c("A", "B"), 300, replace = TRUE) +) +morie_estimate_gate(df, "t", "y", "x", "g") +} diff --git a/r-package/morie/man/estimate_irm.Rd b/r-package/morie/man/morie_estimate_irm.Rd similarity index 90% rename from r-package/morie/man/estimate_irm.Rd rename to r-package/morie/man/morie_estimate_irm.Rd index e1cdfd81c6..cc1c978b1c 100644 --- a/r-package/morie/man/estimate_irm.Rd +++ b/r-package/morie/man/morie_estimate_irm.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/irm.R -\name{estimate_irm} -\alias{estimate_irm} +\name{morie_estimate_irm} +\alias{morie_estimate_irm} \title{Estimate the ATE via the Interactive Regression Model (IRM)} \usage{ -estimate_irm( +morie_estimate_irm( data, treatment, outcome, @@ -59,8 +59,8 @@ If any are unavailable, the function raises an informative error. \examples{ \donttest{ if (requireNamespace("DoubleML", quietly = TRUE) && - requireNamespace("mlr3", quietly = TRUE) && - requireNamespace("mlr3learners", quietly = TRUE)) { + requireNamespace("mlr3", quietly = TRUE) && + requireNamespace("mlr3learners", quietly = TRUE)) { set.seed(1) n <- 200 X <- matrix(rnorm(n * 5), n, 5) @@ -68,8 +68,10 @@ if (requireNamespace("DoubleML", quietly = TRUE) && T <- rbinom(n, 1, ps) Y <- 0.5 * T + X[, 1] + rnorm(n) df <- data.frame(Y = Y, T = T, X) - estimate_irm(df, treatment = "T", outcome = "Y", - covariates = paste0("X", 1:5)) + morie_estimate_irm(df, + treatment = "T", outcome = "Y", + covariates = paste0("X", 1:5) + ) } } } diff --git a/r-package/morie/man/morie_estimate_late.Rd b/r-package/morie/man/morie_estimate_late.Rd new file mode 100644 index 0000000000..38d0ad49b2 --- /dev/null +++ b/r-package/morie/man/morie_estimate_late.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_estimate_late} +\alias{morie_estimate_late} +\title{Estimate the Local Average Treatment Effect (LATE) via 2SLS / Wald} +\usage{ +morie_estimate_late(data, treatment, outcome, instrument, covariates = NULL) +} +\arguments{ +\item{data}{A data frame.} + +\item{treatment}{Name of the binary endogenous treatment column.} + +\item{outcome}{Name of the outcome column.} + +\item{instrument}{Name of the binary instrument column.} + +\item{covariates}{Optional character vector of exogenous covariates.} +} +\value{ +Named list: \code{late}, \code{se}, \code{ci_lower}, \code{ci_upper}, +\code{first_stage_f}, \code{n}. +} +\description{ +Uses a binary instrument \eqn{Z} to identify the LATE (Imbens & Angrist, 1994): +\deqn{LATE = \frac{Cov(Y, Z)}{Cov(T, Z)}} +} +\details{ +With covariates, uses two-stage OLS (Wald within residuals). +Requires \code{ivreg::ivreg()} if available; otherwise falls back to the +closed-form Wald estimator. +} +\examples{ +set.seed(1) +n <- 300L +z <- rbinom(n, 1, 0.5) +t <- rbinom(n, 1, plogis(-0.2 + 1.5 * z)) +y <- 0.8 * t + rnorm(n) +morie_estimate_late(data.frame(t = t, y = y, z = z), "t", "y", "z") +} +\references{ +Imbens GW, Angrist JD (1994). Identification and estimation of local +average treatment effects. \emph{Econometrica}, 62(2), 467-475. +} diff --git a/r-package/morie/man/morie_estimate_propensity_scores.Rd b/r-package/morie/man/morie_estimate_propensity_scores.Rd new file mode 100644 index 0000000000..8811f9b0bd --- /dev/null +++ b/r-package/morie/man/morie_estimate_propensity_scores.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_estimate_propensity_scores} +\alias{morie_estimate_propensity_scores} +\title{Estimate propensity scores via logistic regression} +\usage{ +morie_estimate_propensity_scores( + data, + treatment, + covariates, + trim = c(0.01, 0.99) +) +} +\arguments{ +\item{data}{A data frame.} + +\item{treatment}{Name of the binary treatment column.} + +\item{covariates}{Character vector of covariate names.} + +\item{trim}{Quantile pair used to winsorize extreme scores (default 0.01, 0.99).} +} +\value{ +Numeric vector of propensity scores (same length as \code{nrow(data)}). +} +\description{ +Estimate propensity scores via logistic regression +} +\examples{ +df <- data.frame(t = c(0, 1, 0, 1, 0, 1), x = rnorm(6)) +ps <- morie_estimate_propensity_scores(df, "t", "x") +} diff --git a/r-package/morie/man/morie_eta_squared.Rd b/r-package/morie/man/morie_eta_squared.Rd new file mode 100644 index 0000000000..430411526f --- /dev/null +++ b/r-package/morie/man/morie_eta_squared.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_eta_squared} +\alias{morie_eta_squared} +\title{Eta-squared from F-statistic} +\usage{ +morie_eta_squared(f_stat, df_between, df_within) +} +\arguments{ +\item{f_stat}{F statistic.} + +\item{df_between}{Degrees of freedom (numerator).} + +\item{df_within}{Degrees of freedom (denominator).} +} +\value{ +Numeric eta-squared. +} +\description{ +Eta-squared from F-statistic +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/ewma_volatility.Rd b/r-package/morie/man/morie_ewma_volatility.Rd similarity index 71% rename from r-package/morie/man/ewma_volatility.Rd rename to r-package/morie/man/morie_ewma_volatility.Rd index 0656f3d2a6..c2b2256e81 100644 --- a/r-package/morie/man/ewma_volatility.Rd +++ b/r-package/morie/man/morie_ewma_volatility.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ewtma.R -\name{ewma_volatility} -\alias{ewma_volatility} +\name{morie_ewma_volatility} +\alias{morie_ewma_volatility} \title{EWMA volatility (RiskMetrics 1996)} \usage{ -ewma_volatility(x, lambda = 0.94) +morie_ewma_volatility(x, lambda = 0.94) } \arguments{ \item{x}{Numeric return series.} @@ -19,8 +19,5 @@ Named list with \code{conditional_variance, conditional_volatility, EWMA volatility (RiskMetrics 1996) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ewma_volatility(x = rnorm(50)) } diff --git a/r-package/morie/man/fairness_average_odds_difference.Rd b/r-package/morie/man/morie_fairness_average_odds_difference.Rd similarity index 74% rename from r-package/morie/man/fairness_average_odds_difference.Rd rename to r-package/morie/man/morie_fairness_average_odds_difference.Rd index 2f9a2f91f2..71eafb4e2c 100644 --- a/r-package/morie/man/fairness_average_odds_difference.Rd +++ b/r-package/morie/man/morie_fairness_average_odds_difference.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/frns_metrics.R -\name{fairness_average_odds_difference} -\alias{fairness_average_odds_difference} +\name{morie_fairness_average_odds_difference} +\alias{morie_fairness_average_odds_difference} \title{Average Odds Difference} \usage{ -fairness_average_odds_difference( +morie_fairness_average_odds_difference( y_true, y_pred, group, @@ -36,9 +36,10 @@ AIF360 and the COMPAS \emph{XAI Stories} audit. } \examples{ truth <- c(1, 0, 1, 0, 1, 0, 1, 0) -pred <- c(1, 0, 1, 0, 1, 1, 0, 1) -race <- c(rep("A", 4), rep("B", 4)) -res <- fairness_average_odds_difference(truth, pred, race, - privileged = "A") -res$value # 0.25 +pred <- c(1, 0, 1, 0, 1, 1, 0, 1) +race <- c(rep("A", 4), rep("B", 4)) +res <- morie_fairness_average_odds_difference(truth, pred, race, + privileged = "A" +) +res$value # 0.25 } diff --git a/r-package/morie/man/fairness_bias_amplification.Rd b/r-package/morie/man/morie_fairness_bias_amplification.Rd similarity index 80% rename from r-package/morie/man/fairness_bias_amplification.Rd rename to r-package/morie/man/morie_fairness_bias_amplification.Rd index 598e0d3ed6..610ddd5525 100644 --- a/r-package/morie/man/fairness_bias_amplification.Rd +++ b/r-package/morie/man/morie_fairness_bias_amplification.Rd @@ -1,10 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/frns_metrics.R -\name{fairness_bias_amplification} -\alias{fairness_bias_amplification} +\name{morie_fairness_bias_amplification} +\alias{morie_fairness_bias_amplification} \title{Bias Amplification Score (composite parity-gap times inequality)} \usage{ -fairness_bias_amplification(y_pred, group, privileged = NULL, favorable = 1) +morie_fairness_bias_amplification( + y_pred, + group, + privileged = NULL, + favorable = 1 +) } \arguments{ \item{y_pred}{Vector of decisions/assignments, one per individual.} @@ -35,6 +40,6 @@ Predictive Policing" (arXiv:2603.18987). \examples{ pred <- c(1, 1, 1, 1, 0, 0, 0, 0) race <- c(rep("A", 4), rep("B", 4)) -res <- fairness_bias_amplification(pred, race, privileged = "A") -res$value # -0.5 (parity gap -1.0 times Gini 0.5) +res <- morie_fairness_bias_amplification(pred, race, privileged = "A") +res$value # -0.5 (parity gap -1.0 times Gini 0.5) } diff --git a/r-package/morie/man/fairness_demographic_parity.Rd b/r-package/morie/man/morie_fairness_demographic_parity.Rd similarity index 75% rename from r-package/morie/man/fairness_demographic_parity.Rd rename to r-package/morie/man/morie_fairness_demographic_parity.Rd index e187ba0e72..e3084abfcd 100644 --- a/r-package/morie/man/fairness_demographic_parity.Rd +++ b/r-package/morie/man/morie_fairness_demographic_parity.Rd @@ -1,10 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/frns_metrics.R -\name{fairness_demographic_parity} -\alias{fairness_demographic_parity} +\name{morie_fairness_demographic_parity} +\alias{morie_fairness_demographic_parity} \title{Demographic Parity Gap} \usage{ -fairness_demographic_parity(y_pred, group, privileged = NULL, favorable = 1) +morie_fairness_demographic_parity( + y_pred, + group, + privileged = NULL, + favorable = 1 +) } \arguments{ \item{y_pred}{Vector of decisions/assignments, one per individual.} @@ -29,6 +34,6 @@ group receives favourable outcomes at the same rate. \examples{ pred <- c(1, 1, 1, 1, 0, 0, 0, 1, 0, 0) race <- c(rep("A", 5), rep("B", 5)) -res <- fairness_demographic_parity(pred, race, privileged = "A") -res$value # -0.6 (group B rate 0.2 minus group A rate 0.8) +res <- morie_fairness_demographic_parity(pred, race, privileged = "A") +res$value # -0.6 (group B rate 0.2 minus group A rate 0.8) } diff --git a/r-package/morie/man/fairness_disparate_impact.Rd b/r-package/morie/man/morie_fairness_disparate_impact.Rd similarity index 75% rename from r-package/morie/man/fairness_disparate_impact.Rd rename to r-package/morie/man/morie_fairness_disparate_impact.Rd index 715dd8e6d7..665dade730 100644 --- a/r-package/morie/man/fairness_disparate_impact.Rd +++ b/r-package/morie/man/morie_fairness_disparate_impact.Rd @@ -1,10 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/frns_metrics.R -\name{fairness_disparate_impact} -\alias{fairness_disparate_impact} +\name{morie_fairness_disparate_impact} +\alias{morie_fairness_disparate_impact} \title{Disparate Impact Ratio (EEOC four-fifths rule)} \usage{ -fairness_disparate_impact(y_pred, group, privileged = NULL, favorable = 1) +morie_fairness_disparate_impact( + y_pred, + group, + privileged = NULL, + favorable = 1 +) } \arguments{ \item{y_pred}{Vector of decisions/assignments, one per individual.} @@ -30,7 +35,7 @@ standard legal indicator of adverse impact. \examples{ pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0) race <- c(rep("A", 5), rep("B", 5)) -res <- fairness_disparate_impact(pred, race, privileged = "A") -res$value # 0.6 (group B rate 0.6 / group A rate 1.0) -res$adverse_impact # TRUE +res <- morie_fairness_disparate_impact(pred, race, privileged = "A") +res$value # 0.6 (group B rate 0.6 / group A rate 1.0) +res$adverse_impact # TRUE } diff --git a/r-package/morie/man/fairness_equalized_odds.Rd b/r-package/morie/man/morie_fairness_equalized_odds.Rd similarity index 78% rename from r-package/morie/man/fairness_equalized_odds.Rd rename to r-package/morie/man/morie_fairness_equalized_odds.Rd index 3a681dc596..f63233da12 100644 --- a/r-package/morie/man/fairness_equalized_odds.Rd +++ b/r-package/morie/man/morie_fairness_equalized_odds.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/frns_metrics.R -\name{fairness_equalized_odds} -\alias{fairness_equalized_odds} +\name{morie_fairness_equalized_odds} +\alias{morie_fairness_equalized_odds} \title{Equalized Odds (true- and false-positive-rate gaps)} \usage{ -fairness_equalized_odds( +morie_fairness_equalized_odds( y_true, y_pred, group, @@ -36,8 +36,8 @@ rates. } \examples{ truth <- c(1, 0, 1, 0, 1, 0, 1, 0) -pred <- c(1, 0, 1, 0, 1, 1, 0, 1) -race <- c(rep("A", 4), rep("B", 4)) -res <- fairness_equalized_odds(truth, pred, race, privileged = "A") -res$violation # TRUE +pred <- c(1, 0, 1, 0, 1, 1, 0, 1) +race <- c(rep("A", 4), rep("B", 4)) +res <- morie_fairness_equalized_odds(truth, pred, race, privileged = "A") +res$violation # TRUE } diff --git a/r-package/morie/man/fairness_gini.Rd b/r-package/morie/man/morie_fairness_gini.Rd similarity index 79% rename from r-package/morie/man/fairness_gini.Rd rename to r-package/morie/man/morie_fairness_gini.Rd index 28a5b81553..d352af8bdf 100644 --- a/r-package/morie/man/fairness_gini.Rd +++ b/r-package/morie/man/morie_fairness_gini.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/frns_metrics.R -\name{fairness_gini} -\alias{fairness_gini} +\name{morie_fairness_gini} +\alias{morie_fairness_gini} \title{Gini Coefficient (concentration / inequality of a distribution)} \usage{ -fairness_gini(values, group = NULL) +morie_fairness_gini(values, group = NULL) } \arguments{ \item{values}{Vector of non-negative quantities.} @@ -23,6 +23,6 @@ unequally a system concentrates its attention. With \code{group} supplied, a per-group Gini is also reported. } \examples{ -fairness_gini(c(5, 5, 5, 5))$value # 0 -fairness_gini(c(0, 0, 0, 100))$value # 0.75 +morie_fairness_gini(c(5, 5, 5, 5))$value # 0 +morie_fairness_gini(c(0, 0, 0, 100))$value # 0.75 } diff --git a/r-package/morie/man/morie_fast_available.Rd b/r-package/morie/man/morie_fast_available.Rd index 608b331b63..43ab884113 100644 --- a/r-package/morie/man/morie_fast_available.Rd +++ b/r-package/morie/man/morie_fast_available.Rd @@ -6,14 +6,15 @@ \usage{ morie_fast_available() } +\value{ +A logical scalar: \code{TRUE} when the compiled Rcpp backend was +built and loaded, \code{FALSE} when falling back to base-R kernels. +} \description{ Mirrors \code{morie.fast.is_jit_available()} on the Python side. Returns TRUE when the Rcpp .so was built and loaded; FALSE when falling back to base-R implementations. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_fast_available() } diff --git a/r-package/morie/man/morie_fetch.Rd b/r-package/morie/man/morie_fetch.Rd new file mode 100644 index 0000000000..e5ff69c591 --- /dev/null +++ b/r-package/morie/man/morie_fetch.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_access.R +\name{morie_fetch} +\alias{morie_fetch} +\title{Fetch a dataset from any URL, with automatic format detection} +\usage{ +morie_fetch( + url, + format = c("auto", "csv", "tsv", "json", "xml", "html", "xlsx", "zip", "arcgis"), + params = NULL, + zip_member = "", + simplify = TRUE, + ... +) +} +\arguments{ +\item{url}{The resource URL.} + +\item{format}{One of \code{"auto"} (default), \code{"csv"}, +\code{"tsv"}, \code{"json"}, \code{"xml"}, \code{"html"}, +\code{"xlsx"}, \code{"zip"}, \code{"arcgis"}.} + +\item{params}{Optional named list appended to \code{url} as a +URL-encoded query string.} + +\item{zip_member}{For \code{zip} downloads, the archive member to +extract (matched by basename, then by substring).} + +\item{simplify}{For \code{json}/\code{xml}/\code{html}, whether to +simplify into a data.frame where possible (default \code{TRUE}).} + +\item{...}{Passed to the underlying reader (e.g. \code{\link{read.csv}} +arguments, or \code{\link{morie_fetch_arcgis}} arguments).} +} +\value{ +A data.frame for tabular formats; a list or document object +for non-tabular \code{json}/\code{xml}/\code{html}. +} +\description{ +A universal data-access entry point. Given a URL, MORIE detects the +format from the HTTP \code{Content-Type} header (falling back to the +URL extension), downloads the resource, and parses it into an R +object. The behaviour is automatic by default but every step is +controllable: pass an explicit \code{format}, extra query +\code{params}, a \code{zip_member} to extract, or reader arguments +via \code{...}. +} +\details{ +Supported formats: \code{csv}, \code{tsv}, \code{json}, \code{xml}, +\code{html}, \code{xlsx}, \code{zip} (extract one member), and +\code{arcgis} (delegates to \code{\link{morie_fetch_arcgis}}). +} +\examples{ +\dontrun{ +# Examples use placeholder URLs (example.org). Replace with a +# real CSV / JSON endpoint when running. +df <- morie_fetch("https://example.org/data.csv") +js <- morie_fetch("https://api.example.org/records", + format = "json", params = list(limit = 100) +) +} +} +\seealso{ +\code{\link{morie_ckan_search}}, \code{\link{morie_fetch_arcgis}} +} diff --git a/r-package/morie/man/morie_fetch_arcgis.Rd b/r-package/morie/man/morie_fetch_arcgis.Rd new file mode 100644 index 0000000000..6620868083 --- /dev/null +++ b/r-package/morie/man/morie_fetch_arcgis.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_access.R +\name{morie_fetch_arcgis} +\alias{morie_fetch_arcgis} +\title{Query an ArcGIS FeatureServer / MapServer layer} +\usage{ +morie_fetch_arcgis( + layer_url, + where = "1=1", + out_fields = "*", + params = NULL, + page_size = 2000L, + max_records = Inf +) +} +\arguments{ +\item{layer_url}{The layer URL, ending in \code{/FeatureServer/} +or \code{/MapServer/}.} + +\item{where}{SQL-style WHERE filter (default \code{"1=1"}, all rows).} + +\item{out_fields}{Comma-separated field list (default \code{"*"}).} + +\item{params}{Optional named list of extra query parameters.} + +\item{page_size}{Records requested per page (default 2000).} + +\item{max_records}{Cap on the total number of records (default +\code{Inf} -- fetch the whole layer).} +} +\value{ +A data.frame of feature attributes (geometry is dropped). +} +\description{ +Pulls attribute records from an ArcGIS REST layer, paginating through +the server transfer limit automatically (ArcGIS caps a single query +at \code{maxRecordCount} features, typically 1000-2000). +} +\examples{ +\dontrun{ +layer <- paste0( + "https://services.arcgis.com/ORG/arcgis/rest/", + "services/Assault/FeatureServer/0" +) +df <- morie_fetch_arcgis(layer) +} +} +\seealso{ +\code{\link{morie_fetch}} +} diff --git a/r-package/morie/man/morie_fetch_ckan.Rd b/r-package/morie/man/morie_fetch_ckan.Rd index 81e866823a..0a31eebfac 100644 --- a/r-package/morie/man/morie_fetch_ckan.Rd +++ b/r-package/morie/man/morie_fetch_ckan.Rd @@ -4,14 +4,29 @@ \alias{morie_fetch_ckan} \title{Fetch data from the CKAN API and cache it} \usage{ -morie_fetch_ckan(dataset_key = "cpads", limit = 32000L, db_path = NULL) +morie_fetch_ckan( + dataset_key = "cpads", + limit = Inf, + db_path = NULL, + resource_id = NULL, + con = NULL +) } \arguments{ \item{dataset_key}{One of \code{"cpads"}, \code{"csads"}, \code{"csus"}.} -\item{limit}{Max records to fetch.} +\item{limit}{Maximum records to fetch. The CKAN datastore caps a +single request at 32000 rows, so larger resources are paged through +with \code{offset}; the default reads the entire resource.} \item{db_path}{Optional override for the database path.} + +\item{resource_id}{Optional CKAN datastore resource id. When supplied +(e.g. from \code{morie_dataset_catalog()$ckan_resource_id}) it is used +directly, so any catalogued dataset can be fetched without a built-in +database; \code{dataset_key} then only labels the cache table.} + +\item{con}{Optional pre-opened DBI connection (overrides \code{db_path}).} } \value{ A data.frame. @@ -21,10 +36,10 @@ Fetch data from the CKAN API and cache it } \examples{ \dontrun{ - # Requires network access. Fetches the first 5000 rows of the - # Canadian Postsecondary Alcohol and Drug Use Survey from the - # Government of Canada CKAN datastore: - cpads <- morie_fetch_ckan(dataset_key = "cpads", limit = 5000L) - nrow(cpads) +# Requires network access. Fetches the first 5000 rows of the +# Canadian Postsecondary Alcohol and Drug Use Survey from the +# Government of Canada CKAN datastore: +cpads <- morie_fetch_ckan(dataset_key = "cpads", limit = 5000L) +nrow(cpads) } } diff --git a/r-package/morie/man/morie_fetch_siu.Rd b/r-package/morie/man/morie_fetch_siu.Rd index d15279e1d4..4128b5d158 100644 --- a/r-package/morie/man/morie_fetch_siu.Rd +++ b/r-package/morie/man/morie_fetch_siu.Rd @@ -1,43 +1,98 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mrm_samples.R +% Please edit documentation in R/siu.R \name{morie_fetch_siu} \alias{morie_fetch_siu} -\title{Fetch Ontario SIU Director's Reports into a local CSV} +\title{Fetch the Ontario SIU corpus into a 64-column SIU.csv} \usage{ morie_fetch_siu( - years = NULL, - cache_dir = "~/.cache/morie/siu", - overwrite = FALSE + cache_dir = file.path(tempdir(), "morie", "siu"), + overwrite = FALSE, + max_drid = NULL, + concurrency = 4L, + rate_rps = 4, + use_manifest = TRUE, + lang = c("all", "en", "fr"), + cache_html = FALSE, + progress = TRUE ) } \arguments{ -\item{years}{Optional integer vector of years to scrape. \code{NULL} -(default) scrapes the full unfiltered index.} +\item{cache_dir}{Output directory. Defaults to a session-scoped +subdirectory of \code{\link[base]{tempdir}()} that R cleans up +automatically. For persistent cross-session caching pass +\code{cache_dir = morie_cache_dir("siu")} instead; see +\code{\link{morie_cache_dir}} and \code{\link{morie_cache_clear}}.} -\item{cache_dir}{Output directory (default \code{"~/.cache/morie/siu"}).} +\item{overwrite}{Logical; if \code{FALSE} and \code{SIU.csv} already +exists in \code{cache_dir}, its path is returned without reparsing.} -\item{overwrite}{Logical; if \code{FALSE} and \code{SIU.csv} exists, returns -its path without rescraping.} +\item{max_drid}{Highest director's-report id to fetch. \code{NULL} +(default) uses the shipped manifest's max + a small margin, falling +back to discovery from the SIU site.} + +\item{concurrency}{Maximum simultaneous HTTP transfers. Default +\code{4} is a polite rate paired with \code{rate_rps = 4}; raising +either above ~8/8 risks triggering WAF interstitials that return +short non-report HTML.} + +\item{rate_rps}{Maximum request starts per second across the pool +(token-bucket throttle). Default \code{4} is the rate the package +was empirically validated against; lower it on poor connections +or contested endpoints.} + +\item{use_manifest}{If \code{TRUE} (default), restrict the sweep to +the known-valid drids in the shipped manifest +(\code{inst/extdata/siu_drid_manifest.csv.gz}), still topping up +with any drid above the manifest's max up to \code{max_drid}. +Cuts the fetch by ~30-50 percent on a typical run by skipping holes.} + +\item{lang}{Language filter on the manifest. \code{"all"} (default, +back-compat) fetches every known-valid drid -- English and +French copies of each case -- and then collapses to one row per +case_number with English winning the dedupe. \code{"en"} fetches +only the English drids (about half the size of the corpus and +half the network round trips); \code{"fr"} fetches only French. +Use \code{"en"} for the fastest cold-start when you only need +the canonical English text.} + +\item{cache_html}{If \code{TRUE}, gzip and save the raw HTML of +every fetched director's-report and news-release page under +\code{/html/drid_NNNN.html.gz} and +\code{/html/nrid_NNNN.html.gz}. This is the persistent +ground truth for every row in the emitted CSV: any later +discrepancy between the parser and a human coder can be +adjudicated against the saved HTML without re-hitting SIU. Adds +~80-100 MB to \code{cache_dir} for a full run; default +\code{FALSE} (the harvester remains lean unless you ask).} + +\item{progress}{Logical; print progress messages.} } \value{ -Path to the populated SIU.csv. +Path to the written \code{SIU.csv}. } \description{ -R wrapper around the Python \code{morie.siu_fetch.fetch_siu_cases()} -on-demand scraper. The R version delegates via \code{reticulate} so the -regex / HTML parsing lives in a single canonical location. +Fetches and parses the Ontario Special Investigations Unit +(police-oversight) corpus -- every director's report and the news +releases they link -- into a single CSV with the canonical +64-column schema, one row per case. } \details{ -The scraped corpus is NOT shipped with the package; each user runs -the scraper themselves, which is unambiguously fair use of public -oversight reports. +The parser is implemented entirely in C/C++ (\code{src/siu_parser.cpp}): +libcurl drives the HTTP transport and a concurrent \code{curl_multi} +pool fetches the ~9,000+ pages, while the 64-field extraction is C++ +\code{std::regex} parsing. There is no Python dependency. + +This is the \emph{Ontario} Special Investigations Unit -- distinct +from the federal Structured Intervention Units and from OTIS. The +parsed corpus is not shipped with the package; each user runs the +parser themselves, which is fair use of public oversight reports. } \examples{ \dontrun{ - # Network: scrapes the Ontario SIU Director's Reports site. - csv <- morie_fetch_siu(years = 2023:2024, - cache_dir = tempdir()) - siu <- utils::read.csv(csv) - table(siu$year) +# Network: parses the full Ontario SIU corpus (~15-25 min at the +# default polite rate of 4 RPS). +csv <- morie_fetch_siu(cache_dir = tempdir()) +siu <- utils::read.csv(csv) +nrow(siu) } } diff --git a/r-package/morie/man/morie_fetch_tps.Rd b/r-package/morie/man/morie_fetch_tps.Rd index 21189bb8c0..9b65a73e0c 100644 --- a/r-package/morie/man/morie_fetch_tps.Rd +++ b/r-package/morie/man/morie_fetch_tps.Rd @@ -6,7 +6,7 @@ \usage{ morie_fetch_tps( category, - cache_dir = "~/.cache/morie/tps", + cache_dir = file.path(tempdir(), "morie", "tps"), where = "1=1", overwrite = FALSE, max_per_page = 2000L @@ -15,8 +15,11 @@ morie_fetch_tps( \arguments{ \item{category}{One of \code{names(morie_tps_layer_urls())}.} -\item{cache_dir}{Directory for the CSV -(default \code{"~/.cache/morie/tps"}).} +\item{cache_dir}{Directory for the CSV. Defaults to a +session-scoped subdirectory of \code{tempdir()} that R cleans up +automatically. For persistent caching pass +\code{cache_dir = morie_cache_dir("tps")}; see +\link{morie_cache_dir} and \link{morie_cache_clear}.} \item{where}{ArcGIS SQL where clause (default \code{"1=1"}).} @@ -35,12 +38,14 @@ calls unless \code{overwrite = TRUE}. } \examples{ \dontrun{ - # Network: fetches major-crime indicators from the Toronto Police - # ArcGIS open-data layer. - csv <- morie_fetch_tps(category = "Assault", - cache_dir = tempdir(), - where = "OCC_YEAR = 2024") - tps <- utils::read.csv(csv) - nrow(tps) +# Network: fetches major-crime indicators from the Toronto Police +# ArcGIS open-data layer. +csv <- morie_fetch_tps( + category = "Assault", + cache_dir = tempdir(), + where = "OCC_YEAR = 2024" +) +tps <- utils::read.csv(csv) +nrow(tps) } } diff --git a/r-package/morie/man/morie_find_project_root.Rd b/r-package/morie/man/morie_find_project_root.Rd new file mode 100644 index 0000000000..9636be2f9b --- /dev/null +++ b/r-package/morie/man/morie_find_project_root.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/paths.R +\name{morie_find_project_root} +\alias{morie_find_project_root} +\title{Find a project root directory} +\usage{ +morie_find_project_root(start = getwd(), max_up = 10L) +} +\arguments{ +\item{start}{Starting directory.} + +\item{max_up}{Maximum number of parent traversals.} +} +\value{ +Absolute path to the detected project root. +} +\description{ +Searches upward from \code{start} for a directory containing the current +Sphinx/package-root markers, while still tolerating legacy Quarto-era +markers in older checkouts. +} +\examples{ +tryCatch(morie_find_project_root(), + error = function(e) message("not inside a morie project tree") +) +} diff --git a/r-package/morie/man/morie_fisher_exact_test.Rd b/r-package/morie/man/morie_fisher_exact_test.Rd new file mode 100644 index 0000000000..cb2d341247 --- /dev/null +++ b/r-package/morie/man/morie_fisher_exact_test.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_fisher_exact_test} +\alias{morie_fisher_exact_test} +\title{Fisher's exact test for 2x2 tables} +\usage{ +morie_fisher_exact_test( + table_2x2, + alternative = c("two.sided", "greater", "less") +) +} +\arguments{ +\item{table_2x2}{A 2x2 matrix or data frame of counts.} + +\item{alternative}{\code{"two.sided"}, \code{"greater"}, or \code{"less"}.} +} +\value{ +Named list: \code{odds_ratio}, \code{ci}, \code{p_value}. +} +\description{ +Fisher's exact test for 2x2 tables +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/fwpas_forward_pass_dense.Rd b/r-package/morie/man/morie_fwpas_forward_pass_dense.Rd similarity index 71% rename from r-package/morie/man/fwpas_forward_pass_dense.Rd rename to r-package/morie/man/morie_fwpas_forward_pass_dense.Rd index 452638e81c..42d18b2dd9 100644 --- a/r-package/morie/man/fwpas_forward_pass_dense.Rd +++ b/r-package/morie/man/morie_fwpas_forward_pass_dense.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/fwpas.R -\name{fwpas_forward_pass_dense} -\alias{fwpas_forward_pass_dense} -\alias{forward_pass_dense} +\name{morie_fwpas_forward_pass_dense} +\alias{morie_fwpas_forward_pass_dense} +\alias{morie_forward_pass_dense} \title{Dense layer forward pass} \usage{ -fwpas_forward_pass_dense(x, w, b, activation = "sigmoid") +morie_fwpas_forward_pass_dense(x, w, b, activation = "sigmoid") -forward_pass_dense(x, w, b, activation = "sigmoid") +morie_forward_pass_dense(x, w, b, activation = "sigmoid") } \arguments{ \item{x}{Numeric vector (single input) or matrix (rows = samples).} @@ -30,10 +30,7 @@ Pure-base-R parity for \code{morie.fn.fwpas.forward_pass_dense}. \deqn{z = W x + b, \quad a = \sigma(z)} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_fwpas_forward_pass_dense(x = rnorm(50), w = rnorm(3), b = rnorm(3)) } \references{ Goodfellow, Bengio & Courville (2016), Deep Learning, Ch 6. diff --git a/r-package/morie/man/ganls_gan_loss.Rd b/r-package/morie/man/morie_ganls_gan_loss.Rd similarity index 77% rename from r-package/morie/man/ganls_gan_loss.Rd rename to r-package/morie/man/morie_ganls_gan_loss.Rd index 3ffa67428f..ec6649a01e 100644 --- a/r-package/morie/man/ganls_gan_loss.Rd +++ b/r-package/morie/man/morie_ganls_gan_loss.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ganls.R -\name{ganls_gan_loss} -\alias{ganls_gan_loss} -\alias{gan_loss} +\name{morie_ganls_gan_loss} +\alias{morie_ganls_gan_loss} +\alias{morie_gan_loss} \title{GAN minimax / non-saturating loss} \usage{ -ganls_gan_loss(D_real, D_fake, kind = "minimax") +morie_ganls_gan_loss(D_real, D_fake, kind = "minimax") -gan_loss(D_real, D_fake, kind = "minimax") +morie_gan_loss(D_real, D_fake, kind = "minimax") } \arguments{ \item{D_real}{Discriminator outputs on real data (probabilities).} @@ -30,10 +30,7 @@ so \code{D_loss = -V(D)}. Two generator objectives are supported: \code{kind="nonsaturating"} (\code{G_loss = -E[log D(G(z))]}). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ganls_gan_loss(D_real = rnorm(20), D_fake = rnorm(20)) } \references{ Goodfellow et al. (2014), NeurIPS. diff --git a/r-package/morie/man/garch_fit.Rd b/r-package/morie/man/morie_garch_fit.Rd similarity index 73% rename from r-package/morie/man/garch_fit.Rd rename to r-package/morie/man/morie_garch_fit.Rd index 3e7cd15217..8e45f1183a 100644 --- a/r-package/morie/man/garch_fit.Rd +++ b/r-package/morie/man/morie_garch_fit.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/garch.R -\name{garch_fit} -\alias{garch_fit} +\name{morie_garch_fit} +\alias{morie_garch_fit} \title{Fit a GARCH(1,1) model to a return series} \usage{ -garch_fit(x) +morie_garch_fit(x) } \arguments{ \item{x}{Numeric return series.} @@ -17,8 +17,5 @@ Named list with \code{omega, alpha, beta, persistence, loglik, \deqn{\sigma_t^2 = \omega + \alpha \epsilon_{t-1}^2 + \beta \sigma_{t-1}^2.} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_garch_fit(x = rnorm(50)) } diff --git a/r-package/morie/man/gblup_full.Rd b/r-package/morie/man/morie_gblup_full.Rd similarity index 73% rename from r-package/morie/man/gblup_full.Rd rename to r-package/morie/man/morie_gblup_full.Rd index cb46e71c0c..20dcc44e24 100644 --- a/r-package/morie/man/gblup_full.Rd +++ b/r-package/morie/man/morie_gblup_full.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/gblpf.R -\name{gblup_full} -\alias{gblup_full} +\name{morie_gblup_full} +\alias{morie_gblup_full} \title{GBLUP -- full mixed-model implementation} \usage{ -gblup_full(x, y, markers, lambda_gblup = NULL) +morie_gblup_full(x, y, markers, lambda_gblup = NULL) } \arguments{ \item{x}{Fixed-effect design (vector or matrix).} @@ -22,10 +22,7 @@ Named list (estimate, g_hat, beta, se, lambda_gblup, n, method). Solves Henderson's MME with VanRaden G. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_gblup_full(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) } \references{ Montesinos Lopez Ch 3. diff --git a/r-package/morie/man/morie_generate_ar_coefficients.Rd b/r-package/morie/man/morie_generate_ar_coefficients.Rd index e99cc4de06..6618d555cd 100644 --- a/r-package/morie/man/morie_generate_ar_coefficients.Rd +++ b/r-package/morie/man/morie_generate_ar_coefficients.Rd @@ -28,8 +28,6 @@ A p x p numeric matrix A. Generate a stationarity-preserving AR coefficient matrix } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/morie_generate_synthetic_data.Rd b/r-package/morie/man/morie_generate_synthetic_data.Rd new file mode 100644 index 0000000000..fa1d721f8e --- /dev/null +++ b/r-package/morie/man/morie_generate_synthetic_data.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/synthetic.R +\name{morie_generate_synthetic_data} +\alias{morie_generate_synthetic_data} +\title{Generate synthetic epidemiology-style tabular data} +\usage{ +morie_generate_synthetic_data( + n = 5000L, + seed = 42L, + special_code_rate = 0.02, + profile = c("generic", "morie_legacy"), + name_map = NULL +) +} +\arguments{ +\item{n}{Number of rows.} + +\item{seed}{Random seed for reproducibility.} + +\item{special_code_rate}{Proportion of values replaced with survey-style +special missing codes (\code{97/98/99/997/998/999}) in discrete fields.} + +\item{profile}{Convenience profile for output naming; ignored when +\code{name_map} is supplied.} + +\item{name_map}{Optional named character vector mapping canonical keys to +output column names. Use \code{\link[=morie_default_synthetic_name_map]{morie_default_synthetic_name_map()}} as a template.} +} +\value{ +A data.frame with synthetic records. +} +\description{ +Generates non-identifying synthetic data suitable for development, testing, +and demos. The generator uses a canonical variable set and allows output +column renaming through \code{name_map} so it can be adapted to multiple studies. +Synthetic data should not be used for final inferential reporting. +} +\examples{ +df <- morie_generate_synthetic_data(n = 200, seed = 1) +nrow(df) +} diff --git a/r-package/morie/man/morie_generate_var_coefficients.Rd b/r-package/morie/man/morie_generate_var_coefficients.Rd index fcd5dbc856..07ceed20c8 100644 --- a/r-package/morie/man/morie_generate_var_coefficients.Rd +++ b/r-package/morie/man/morie_generate_var_coefficients.Rd @@ -30,8 +30,6 @@ A list of length \code{lags}, each a p x p matrix. Generate a VAR(L) coefficient array as a 3-d list } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/genomic_cross_validation.Rd b/r-package/morie/man/morie_genomic_cross_validation.Rd similarity index 70% rename from r-package/morie/man/genomic_cross_validation.Rd rename to r-package/morie/man/morie_genomic_cross_validation.Rd index 86c2fcf8dd..e13838c336 100644 --- a/r-package/morie/man/genomic_cross_validation.Rd +++ b/r-package/morie/man/morie_genomic_cross_validation.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/gcvgn.R -\name{genomic_cross_validation} -\alias{genomic_cross_validation} +\name{morie_genomic_cross_validation} +\alias{morie_genomic_cross_validation} \title{K-fold cross-validation for genomic-prediction accuracy} \usage{ -genomic_cross_validation(x, y, K = 5, lam = 1, seed = 0) +morie_genomic_cross_validation(x, y, K = 5, lam = 1, seed = 0) } \arguments{ \item{x}{(n x p) predictor matrix.} @@ -24,10 +24,7 @@ list(estimate, r_per_fold, y_hat, mse, mspe, slope, n, K, method). K-fold cross-validation for genomic-prediction accuracy } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_genomic_cross_validation(x = rnorm(50), y = rnorm(50)) } \references{ Montesinos Lopez Ch 2. diff --git a/r-package/morie/man/ghosal_adaptation.Rd b/r-package/morie/man/morie_ghosal_adaptation.Rd similarity index 73% rename from r-package/morie/man/ghosal_adaptation.Rd rename to r-package/morie/man/morie_ghosal_adaptation.Rd index ada1a0919b..9fe0b3125b 100644 --- a/r-package/morie/man/ghosal_adaptation.Rd +++ b/r-package/morie/man/morie_ghosal_adaptation.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghadp.R -\name{ghosal_adaptation} -\alias{ghosal_adaptation} +\name{morie_ghosal_adaptation} +\alias{morie_ghosal_adaptation} \title{Adaptive contraction rates over a smoothness grid.} \usage{ -ghosal_adaptation(x, betas = NULL, d = 1) +morie_ghosal_adaptation(x, betas = NULL, d = 1) } \arguments{ \item{x}{Numeric data vector (used only for sample-size n).} @@ -20,8 +20,5 @@ Named list with estimate, betas, rates, best_beta, n, d, method. Adaptive contraction rates over a smoothness grid. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_adaptation(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_bernstein_von_mises.Rd b/r-package/morie/man/morie_ghosal_bernstein_von_mises.Rd similarity index 82% rename from r-package/morie/man/ghosal_bernstein_von_mises.Rd rename to r-package/morie/man/morie_ghosal_bernstein_von_mises.Rd index ef32a3cc92..9a76a9f5dc 100644 --- a/r-package/morie/man/ghosal_bernstein_von_mises.Rd +++ b/r-package/morie/man/morie_ghosal_bernstein_von_mises.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghbvm.R -\name{ghosal_bernstein_von_mises} -\alias{ghosal_bernstein_von_mises} +\name{morie_ghosal_bernstein_von_mises} +\alias{morie_ghosal_bernstein_von_mises} \title{BvM diagnostic for the mean functional under a DP prior.} \usage{ -ghosal_bernstein_von_mises( +morie_ghosal_bernstein_von_mises( x, theta0 = NULL, B = 500, @@ -34,8 +34,5 @@ wald, wald_pvalue, n, B, method. BvM diagnostic for the mean functional under a DP prior. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_bernstein_von_mises(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_contraction_rate.Rd b/r-package/morie/man/morie_ghosal_contraction_rate.Rd similarity index 70% rename from r-package/morie/man/ghosal_contraction_rate.Rd rename to r-package/morie/man/morie_ghosal_contraction_rate.Rd index bcd1911211..355091a181 100644 --- a/r-package/morie/man/ghosal_contraction_rate.Rd +++ b/r-package/morie/man/morie_ghosal_contraction_rate.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghcrt.R -\name{ghosal_contraction_rate} -\alias{ghosal_contraction_rate} +\name{morie_ghosal_contraction_rate} +\alias{morie_ghosal_contraction_rate} \title{Minimax posterior-contraction rate} \usage{ -ghosal_contraction_rate(x, beta = 1, d = 1) +morie_ghosal_contraction_rate(x, beta = 1, d = 1) } \arguments{ \item{x}{Numeric data vector (used only for sample-size n).} @@ -20,8 +20,5 @@ Named list with estimate, log_rate_correction, parametric_rate, n, beta, d, meth Returns eps_n = n raised to the power of -beta/(2*beta+d). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_contraction_rate(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_dirichlet_posterior.Rd b/r-package/morie/man/morie_ghosal_dirichlet_posterior.Rd similarity index 79% rename from r-package/morie/man/ghosal_dirichlet_posterior.Rd rename to r-package/morie/man/morie_ghosal_dirichlet_posterior.Rd index a7921a85e1..539c6df0a9 100644 --- a/r-package/morie/man/ghosal_dirichlet_posterior.Rd +++ b/r-package/morie/man/morie_ghosal_dirichlet_posterior.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghdir.R -\name{ghosal_dirichlet_posterior} -\alias{ghosal_dirichlet_posterior} +\name{morie_ghosal_dirichlet_posterior} +\alias{morie_ghosal_dirichlet_posterior} \title{Dirichlet-process posterior (conjugate update)} \usage{ -ghosal_dirichlet_posterior( +morie_ghosal_dirichlet_posterior( x, alpha = 1, base_mean = 0, @@ -31,8 +31,5 @@ G0 = N(base_mean, base_sd^2). Returns the posterior-mean CDF evaluated on a grid plus the headline \code{estimate} at \code{mean(x)}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_dirichlet_posterior(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_dpmixture_density.Rd b/r-package/morie/man/morie_ghosal_dpmixture_density.Rd similarity index 83% rename from r-package/morie/man/ghosal_dpmixture_density.Rd rename to r-package/morie/man/morie_ghosal_dpmixture_density.Rd index 312d69f364..70e8fb2934 100644 --- a/r-package/morie/man/ghosal_dpmixture_density.Rd +++ b/r-package/morie/man/morie_ghosal_dpmixture_density.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghdpm.R -\name{ghosal_dpmixture_density} -\alias{ghosal_dpmixture_density} +\name{morie_ghosal_dpmixture_density} +\alias{morie_ghosal_dpmixture_density} \title{DP mixture density estimate (Neal 2000 algorithm 3)} \usage{ -ghosal_dpmixture_density( +morie_ghosal_dpmixture_density( x, alpha = 1, sigma = NULL, @@ -36,8 +36,5 @@ named list with \code{estimate}, \code{grid}, \code{density}, \code{k_post}, \co DP mixture density estimate (Neal 2000 algorithm 3) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_dpmixture_density(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_empirical_bayes.Rd b/r-package/morie/man/morie_ghosal_empirical_bayes.Rd similarity index 69% rename from r-package/morie/man/ghosal_empirical_bayes.Rd rename to r-package/morie/man/morie_ghosal_empirical_bayes.Rd index 0aba47acc1..5e59d6aa6a 100644 --- a/r-package/morie/man/ghosal_empirical_bayes.Rd +++ b/r-package/morie/man/morie_ghosal_empirical_bayes.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghebp.R -\name{ghosal_empirical_bayes} -\alias{ghosal_empirical_bayes} +\name{morie_ghosal_empirical_bayes} +\alias{morie_ghosal_empirical_bayes} \title{Empirical-Bayes alpha MLE for a DP, given the observed K_n.} \usage{ -ghosal_empirical_bayes(x, alpha_grid = NULL) +morie_ghosal_empirical_bayes(x, alpha_grid = NULL) } \arguments{ \item{x}{Numeric data vector.} @@ -18,8 +18,5 @@ Named list with estimate (alpha-hat), K_n, log_lik_at_estimate, n, method. Empirical-Bayes alpha MLE for a DP, given the observed K_n. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_empirical_bayes(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_gp_matern.Rd b/r-package/morie/man/morie_ghosal_gp_matern.Rd similarity index 82% rename from r-package/morie/man/ghosal_gp_matern.Rd rename to r-package/morie/man/morie_ghosal_gp_matern.Rd index df667b26a6..ffd1903abb 100644 --- a/r-package/morie/man/ghosal_gp_matern.Rd +++ b/r-package/morie/man/morie_ghosal_gp_matern.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghgpm.R -\name{ghosal_gp_matern} -\alias{ghosal_gp_matern} +\name{morie_ghosal_gp_matern} +\alias{morie_ghosal_gp_matern} \title{GP posterior mean with Matern kernel.} \usage{ -ghosal_gp_matern( +morie_ghosal_gp_matern( x, y, nu = 1.5, @@ -36,8 +36,5 @@ Named list with estimate, se, mu, sd, length_scale, nu, noise, n, method. GP posterior mean with Matern kernel. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_gp_matern(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_gp_squared_exponential.Rd b/r-package/morie/man/morie_ghosal_gp_squared_exponential.Rd similarity index 77% rename from r-package/morie/man/ghosal_gp_squared_exponential.Rd rename to r-package/morie/man/morie_ghosal_gp_squared_exponential.Rd index c2013edccc..be74c90737 100644 --- a/r-package/morie/man/ghosal_gp_squared_exponential.Rd +++ b/r-package/morie/man/morie_ghosal_gp_squared_exponential.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghgps.R -\name{ghosal_gp_squared_exponential} -\alias{ghosal_gp_squared_exponential} +\name{morie_ghosal_gp_squared_exponential} +\alias{morie_ghosal_gp_squared_exponential} \title{GP posterior mean with squared-exponential kernel.} \usage{ -ghosal_gp_squared_exponential( +morie_ghosal_gp_squared_exponential( x, y, length_scale = NULL, @@ -33,8 +33,5 @@ Named list with estimate, se, mu, sd, length_scale, noise, n, method. GP posterior mean with squared-exponential kernel. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_gp_squared_exponential(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_hierarchical_bayes.Rd b/r-package/morie/man/morie_ghosal_hierarchical_bayes.Rd similarity index 84% rename from r-package/morie/man/ghosal_hierarchical_bayes.Rd rename to r-package/morie/man/morie_ghosal_hierarchical_bayes.Rd index 41dc6bf74e..1890f2ecf9 100644 --- a/r-package/morie/man/ghosal_hierarchical_bayes.Rd +++ b/r-package/morie/man/morie_ghosal_hierarchical_bayes.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghhbp.R -\name{ghosal_hierarchical_bayes} -\alias{ghosal_hierarchical_bayes} +\name{morie_ghosal_hierarchical_bayes} +\alias{morie_ghosal_hierarchical_bayes} \title{Escobar-West augmentation for alpha given K_n with a Gamma(a, b) hyperprior.} \usage{ -ghosal_hierarchical_bayes( +morie_ghosal_hierarchical_bayes( x, a_prior = 1, b_prior = 1, @@ -37,8 +37,5 @@ alpha_draws, K_n, n, method. Escobar-West augmentation for alpha given K_n with a Gamma(a, b) hyperprior. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_hierarchical_bayes(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_log_density.Rd b/r-package/morie/man/morie_ghosal_log_density.Rd similarity index 71% rename from r-package/morie/man/ghosal_log_density.Rd rename to r-package/morie/man/morie_ghosal_log_density.Rd index 5c2f0a555e..638c5dd30f 100644 --- a/r-package/morie/man/ghosal_log_density.Rd +++ b/r-package/morie/man/morie_ghosal_log_density.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghlgd.R -\name{ghosal_log_density} -\alias{ghosal_log_density} +\name{morie_ghosal_log_density} +\alias{morie_ghosal_log_density} \title{Log-spline density estimator (Stone 1990, Ghosal Ch 8).} \usage{ -ghosal_log_density(x, K = 5, grid = NULL) +morie_ghosal_log_density(x, K = 5, grid = NULL) } \arguments{ \item{x}{Numeric data vector.} @@ -20,8 +20,5 @@ Named list with estimate, theta, log_lik, grid, log_density, K, n, method. Log-spline density estimator (Stone 1990, Ghosal Ch 8). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_log_density(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_moment_matching.Rd b/r-package/morie/man/morie_ghosal_moment_matching.Rd similarity index 81% rename from r-package/morie/man/ghosal_moment_matching.Rd rename to r-package/morie/man/morie_ghosal_moment_matching.Rd index 3c411eea26..c803b1b343 100644 --- a/r-package/morie/man/ghosal_moment_matching.Rd +++ b/r-package/morie/man/morie_ghosal_moment_matching.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghmmt.R -\name{ghosal_moment_matching} -\alias{ghosal_moment_matching} +\name{morie_ghosal_moment_matching} +\alias{morie_ghosal_moment_matching} \title{Posterior mean / variance of G(A) for DP(alpha, G0) and A = (A_lower, A_upper].} \usage{ -ghosal_moment_matching( +morie_ghosal_moment_matching( x, alpha = 1, A_lower = NULL, @@ -33,8 +33,5 @@ Named list with estimate, se, prior_mean, prior_var, n_A, n, alpha, method. Posterior mean / variance of G(A) for DP(alpha, G0) and A = (A_lower, A_upper]. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_moment_matching(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_neutral_right.Rd b/r-package/morie/man/morie_ghosal_neutral_right.Rd similarity index 72% rename from r-package/morie/man/ghosal_neutral_right.Rd rename to r-package/morie/man/morie_ghosal_neutral_right.Rd index 4b82f2ceab..7bba2fe264 100644 --- a/r-package/morie/man/ghosal_neutral_right.Rd +++ b/r-package/morie/man/morie_ghosal_neutral_right.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghntr.R -\name{ghosal_neutral_right} -\alias{ghosal_neutral_right} +\name{morie_ghosal_neutral_right} +\alias{morie_ghosal_neutral_right} \title{Neutral-to-the-right posterior survival (Doksum 1974).} \usage{ -ghosal_neutral_right(time, event = NULL, c = 1, lam0 = NULL) +morie_ghosal_neutral_right(time, event = NULL, c = 1, lam0 = NULL) } \arguments{ \item{time}{Numeric vector of observed times.} @@ -22,8 +22,5 @@ Named list with estimate, times, S_post, H_post, c, lam0, n, method. Neutral-to-the-right posterior survival (Doksum 1974). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_neutral_right(time = cumsum(rexp(50))) } diff --git a/r-package/morie/man/ghosal_np_classification.Rd b/r-package/morie/man/morie_ghosal_np_classification.Rd similarity index 78% rename from r-package/morie/man/ghosal_np_classification.Rd rename to r-package/morie/man/morie_ghosal_np_classification.Rd index 99c6e2bca2..63888e1f7d 100644 --- a/r-package/morie/man/ghosal_np_classification.Rd +++ b/r-package/morie/man/morie_ghosal_np_classification.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghcls.R -\name{ghosal_np_classification} -\alias{ghosal_np_classification} +\name{morie_ghosal_np_classification} +\alias{morie_ghosal_np_classification} \title{Probit-GP classifier (Laplace approximation).} \usage{ -ghosal_np_classification( +morie_ghosal_np_classification( x, y, length_scale = NULL, @@ -33,8 +33,5 @@ Named list with estimate, p_hat, accuracy, length_scale, n, method. Probit-GP classifier (Laplace approximation). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_np_classification(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_np_regression.Rd b/r-package/morie/man/morie_ghosal_np_regression.Rd similarity index 65% rename from r-package/morie/man/ghosal_np_regression.Rd rename to r-package/morie/man/morie_ghosal_np_regression.Rd index 744815ec55..4085b9e003 100644 --- a/r-package/morie/man/ghosal_np_regression.Rd +++ b/r-package/morie/man/morie_ghosal_np_regression.Rd @@ -1,10 +1,16 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghreg.R -\name{ghosal_np_regression} -\alias{ghosal_np_regression} +\name{morie_ghosal_np_regression} +\alias{morie_ghosal_np_regression} \title{GP nonparametric regression} \usage{ -ghosal_np_regression(x, y, length_scale = NULL, sigma_f = 1, noise = NULL) +morie_ghosal_np_regression( + x, + y, + length_scale = NULL, + sigma_f = 1, + noise = NULL +) } \arguments{ \item{x}{Numeric vector or matrix of input points.} @@ -22,11 +28,8 @@ Named list with estimate, se, mu, sd, ci_lower, ci_upper, r2, log_marginal, length_scale, noise, n, method. } \description{ -Wraps \code{ghosal_gp_squared_exponential}. +Wraps \code{morie_ghosal_gp_squared_exponential}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_np_regression(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_np_testing.Rd b/r-package/morie/man/morie_ghosal_np_testing.Rd similarity index 73% rename from r-package/morie/man/ghosal_np_testing.Rd rename to r-package/morie/man/morie_ghosal_np_testing.Rd index 7f104bc270..f637167d6e 100644 --- a/r-package/morie/man/ghosal_np_testing.Rd +++ b/r-package/morie/man/morie_ghosal_np_testing.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghtst.R -\name{ghosal_np_testing} -\alias{ghosal_np_testing} +\name{morie_ghosal_np_testing} +\alias{morie_ghosal_np_testing} \title{Polya-tree Bayes factor for H0: F = N(loc, scale^2).} \usage{ -ghosal_np_testing(x, ref_loc = 0, ref_scale = 1, depth = 6, c = 1) +morie_ghosal_np_testing(x, ref_loc = 0, ref_scale = 1, depth = 6, c = 1) } \arguments{ \item{x}{Numeric data vector.} @@ -24,8 +24,5 @@ Named list with statistic (log BF), p_value, BF10, log_BF10, n, depth, method. Polya-tree Bayes factor for H0: F = N(loc, scale^2). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_np_testing(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_posterior_consistency.Rd b/r-package/morie/man/morie_ghosal_posterior_consistency.Rd similarity index 78% rename from r-package/morie/man/ghosal_posterior_consistency.Rd rename to r-package/morie/man/morie_ghosal_posterior_consistency.Rd index a18388daa0..5be21ba836 100644 --- a/r-package/morie/man/ghosal_posterior_consistency.Rd +++ b/r-package/morie/man/morie_ghosal_posterior_consistency.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghcon.R -\name{ghosal_posterior_consistency} -\alias{ghosal_posterior_consistency} +\name{morie_ghosal_posterior_consistency} +\alias{morie_ghosal_posterior_consistency} \title{Schwartz posterior-consistency diagnostic (Bayesian bootstrap).} \usage{ -ghosal_posterior_consistency( +morie_ghosal_posterior_consistency( x, ref_loc = NULL, ref_scale = NULL, @@ -33,8 +33,5 @@ Named list with estimate, ks_mean, ks_se, schwartz_bound, n, eps, method. Schwartz posterior-consistency diagnostic (Bayesian bootstrap). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_posterior_consistency(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_sieve_prior.Rd b/r-package/morie/man/morie_ghosal_sieve_prior.Rd similarity index 70% rename from r-package/morie/man/ghosal_sieve_prior.Rd rename to r-package/morie/man/morie_ghosal_sieve_prior.Rd index 4f15dd802f..2392c7bb82 100644 --- a/r-package/morie/man/ghosal_sieve_prior.Rd +++ b/r-package/morie/man/morie_ghosal_sieve_prior.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghsve.R -\name{ghosal_sieve_prior} -\alias{ghosal_sieve_prior} +\name{morie_ghosal_sieve_prior} +\alias{morie_ghosal_sieve_prior} \title{Bernstein-polynomial sieve density estimator (Petrone 1999).} \usage{ -ghosal_sieve_prior(x, K = NULL) +morie_ghosal_sieve_prior(x, K = NULL) } \arguments{ \item{x}{Numeric data vector.} @@ -18,8 +18,5 @@ Named list with estimate, log_lik_per_obs, weights, K, n, method. Bernstein-polynomial sieve density estimator (Petrone 1999). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_sieve_prior(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_stick_breaking_trunc.Rd b/r-package/morie/man/morie_ghosal_stick_breaking_trunc.Rd similarity index 83% rename from r-package/morie/man/ghosal_stick_breaking_trunc.Rd rename to r-package/morie/man/morie_ghosal_stick_breaking_trunc.Rd index d6bba04631..20cfc41bec 100644 --- a/r-package/morie/man/ghosal_stick_breaking_trunc.Rd +++ b/r-package/morie/man/morie_ghosal_stick_breaking_trunc.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghstk.R -\name{ghosal_stick_breaking_trunc} -\alias{ghosal_stick_breaking_trunc} +\name{morie_ghosal_stick_breaking_trunc} +\alias{morie_ghosal_stick_breaking_trunc} \title{Truncated stick-breaking representation of DP(alpha, G0).} \usage{ -ghosal_stick_breaking_trunc( +morie_ghosal_stick_breaking_trunc( x, alpha = 1, K = 50, @@ -40,8 +40,5 @@ trunc_err_bound, n, method. Truncated stick-breaking representation of DP(alpha, G0). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_stick_breaking_trunc(x = rnorm(50)) } diff --git a/r-package/morie/man/ghosal_survival_beta_process.Rd b/r-package/morie/man/morie_ghosal_survival_beta_process.Rd similarity index 69% rename from r-package/morie/man/ghosal_survival_beta_process.Rd rename to r-package/morie/man/morie_ghosal_survival_beta_process.Rd index d2c7fc1591..72afb79877 100644 --- a/r-package/morie/man/ghosal_survival_beta_process.Rd +++ b/r-package/morie/man/morie_ghosal_survival_beta_process.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghsrv.R -\name{ghosal_survival_beta_process} -\alias{ghosal_survival_beta_process} +\name{morie_ghosal_survival_beta_process} +\alias{morie_ghosal_survival_beta_process} \title{Beta-process posterior survival (Hjort 1990).} \usage{ -ghosal_survival_beta_process(time, event = NULL, c = 1, lam0 = NULL) +morie_ghosal_survival_beta_process(time, event = NULL, c = 1, lam0 = NULL) } \arguments{ \item{time}{Numeric vector of observed times.} @@ -22,8 +22,5 @@ Named list with estimate, times, S_post, H_post, c, lam0, n, method. Beta-process posterior survival (Hjort 1990). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_survival_beta_process(time = cumsum(rexp(50))) } diff --git a/r-package/morie/man/ghosal_wavelet_prior.Rd b/r-package/morie/man/morie_ghosal_wavelet_prior.Rd similarity index 70% rename from r-package/morie/man/ghosal_wavelet_prior.Rd rename to r-package/morie/man/morie_ghosal_wavelet_prior.Rd index 1fb1710217..ea37aa41d7 100644 --- a/r-package/morie/man/ghosal_wavelet_prior.Rd +++ b/r-package/morie/man/morie_ghosal_wavelet_prior.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ghwav.R -\name{ghosal_wavelet_prior} -\alias{ghosal_wavelet_prior} +\name{morie_ghosal_wavelet_prior} +\alias{morie_ghosal_wavelet_prior} \title{Haar-wavelet spike-and-slab BayesThresh estimator (Abramovich 1998).} \usage{ -ghosal_wavelet_prior(x, pi = 0.5, sigma = NULL, noise = NULL) +morie_ghosal_wavelet_prior(x, pi = 0.5, sigma = NULL, noise = NULL) } \arguments{ \item{x}{Numeric data vector.} @@ -22,8 +22,5 @@ Named list with estimate, fitted, noise, sigma, inclusion, n, method. Haar-wavelet spike-and-slab BayesThresh estimator (Abramovich 1998). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ghosal_wavelet_prior(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_gpl_compatible_licenses.Rd b/r-package/morie/man/morie_gpl_compatible_licenses.Rd index b6e9d4a282..caebf5b269 100644 --- a/r-package/morie/man/morie_gpl_compatible_licenses.Rd +++ b/r-package/morie/man/morie_gpl_compatible_licenses.Rd @@ -16,8 +16,5 @@ is GPL-3 compatible but not GPL-2 compatible; morie is GPL-2.0-only so the choice rests with downstream consumers. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_gpl_compatible_licenses() } diff --git a/r-package/morie/man/gradient_boosting_ensemble.Rd b/r-package/morie/man/morie_gradient_boosting_ensemble.Rd similarity index 85% rename from r-package/morie/man/gradient_boosting_ensemble.Rd rename to r-package/morie/man/morie_gradient_boosting_ensemble.Rd index c0e1da3e9a..da153ee1fd 100644 --- a/r-package/morie/man/gradient_boosting_ensemble.Rd +++ b/r-package/morie/man/morie_gradient_boosting_ensemble.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/gbens.R -\name{gradient_boosting_ensemble} -\alias{gradient_boosting_ensemble} +\name{morie_gradient_boosting_ensemble} +\alias{morie_gradient_boosting_ensemble} \title{Gradient boosting ensemble (R parity)} \usage{ -gradient_boosting_ensemble( +morie_gradient_boosting_ensemble( x, y, n_estimators = 100L, @@ -44,8 +44,5 @@ Wraps \code{gbm::gbm} when available, otherwise falls back to \code{xgboost} as a portable boosted-trees backend. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_gradient_boosting_ensemble(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/gradient_boosting_genomic.Rd b/r-package/morie/man/morie_gradient_boosting_genomic.Rd similarity index 75% rename from r-package/morie/man/gradient_boosting_genomic.Rd rename to r-package/morie/man/morie_gradient_boosting_genomic.Rd index 5ca0162ae5..148becb58c 100644 --- a/r-package/morie/man/gradient_boosting_genomic.Rd +++ b/r-package/morie/man/morie_gradient_boosting_genomic.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/gbgen.R -\name{gradient_boosting_genomic} -\alias{gradient_boosting_genomic} +\name{morie_gradient_boosting_genomic} +\alias{morie_gradient_boosting_genomic} \title{Gradient-boosting genomic predictor (Friedman 2001)} \usage{ -gradient_boosting_genomic( +morie_gradient_boosting_genomic( x, y, markers, @@ -36,10 +36,10 @@ list(estimate, y_hat, train_loss, se, n, method). Uses gbm if available; otherwise base-R boosted stumps. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_gradient_boosting_genomic( + x = rnorm(50), y = rnorm(50), + markers = matrix(sample(0:2, 200, TRUE), 50, 4) +) } \references{ Friedman (2001); Montesinos Lopez Ch 9. diff --git a/r-package/morie/man/gradient_descent_vanilla.Rd b/r-package/morie/man/morie_gradient_descent_vanilla.Rd similarity index 73% rename from r-package/morie/man/gradient_descent_vanilla.Rd rename to r-package/morie/man/morie_gradient_descent_vanilla.Rd index 7128f20e2b..47c1305ca2 100644 --- a/r-package/morie/man/gradient_descent_vanilla.Rd +++ b/r-package/morie/man/morie_gradient_descent_vanilla.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/grdds.R -\name{gradient_descent_vanilla} -\alias{gradient_descent_vanilla} +\name{morie_gradient_descent_vanilla} +\alias{morie_gradient_descent_vanilla} \title{Vanilla batch gradient descent for OLS (R parity)} \usage{ -gradient_descent_vanilla(x, y, lr = 0.01, n_iter = 1000, tol = 1e-08) +morie_gradient_descent_vanilla(x, y, lr = 0.01, n_iter = 1000, tol = 1e-08) } \arguments{ \item{x}{Numeric matrix / vector of predictors.} @@ -26,8 +26,5 @@ theta := theta - lr * (2/n) X' (X theta - y), intercept included. Validates against \code{stats::lm} reference. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_gradient_descent_vanilla(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/grid_search_cv.Rd b/r-package/morie/man/morie_grid_search_cv.Rd similarity index 76% rename from r-package/morie/man/grid_search_cv.Rd rename to r-package/morie/man/morie_grid_search_cv.Rd index 01467f01b8..f8f0ae6f77 100644 --- a/r-package/morie/man/grid_search_cv.Rd +++ b/r-package/morie/man/morie_grid_search_cv.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/gsrch.R -\name{grid_search_cv} -\alias{grid_search_cv} +\name{morie_grid_search_cv} +\alias{morie_grid_search_cv} \title{Grid search with cross-validation (R parity)} \usage{ -grid_search_cv( +morie_grid_search_cv( x, y, method = NULL, @@ -38,8 +38,9 @@ Wraps \code{caret::train} with method = "glm" (classification) or "lm" (regression) by default; users can pass any caret \code{method}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_grid_search_cv( + x = matrix(rnorm(150), 50, 3), y = rnorm(50), + method = "lm", tune_grid = data.frame(intercept = c(TRUE, FALSE)), + cv = 3L, task = "regression", seed = 1L +) } diff --git a/r-package/morie/man/grm_vanraden.Rd b/r-package/morie/man/morie_grm_vanraden.Rd similarity index 77% rename from r-package/morie/man/grm_vanraden.Rd rename to r-package/morie/man/morie_grm_vanraden.Rd index e519c615d7..50beac3d9e 100644 --- a/r-package/morie/man/grm_vanraden.Rd +++ b/r-package/morie/man/morie_grm_vanraden.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/aaa_helpers_montesinos.R -\name{grm_vanraden} -\alias{grm_vanraden} +\name{morie_grm_vanraden} +\alias{morie_grm_vanraden} \title{VanRaden Genomic Relationship Matrix} \usage{ -grm_vanraden(markers, method = 1) +morie_grm_vanraden(markers, method = 1) } \arguments{ \item{markers}{Numeric (n x m) genotype matrix coded (coded 0/1/2).} @@ -19,10 +19,7 @@ Computes G = ZZ' / (2 sum p_j(1-p_j)) for method 1 (default), or the per-locus-scaled variant for method 2. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_grm_vanraden(markers = matrix(sample(0:2, 200, TRUE), 50, 4)) } \references{ VanRaden (2008) J Dairy Sci 91:4414. Montesinos Lopez Ch 3. diff --git a/r-package/morie/man/grucl_gru_cell.Rd b/r-package/morie/man/morie_grucl_gru_cell.Rd similarity index 86% rename from r-package/morie/man/grucl_gru_cell.Rd rename to r-package/morie/man/morie_grucl_gru_cell.Rd index 7b6697868d..419998da93 100644 --- a/r-package/morie/man/grucl_gru_cell.Rd +++ b/r-package/morie/man/morie_grucl_gru_cell.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/grucl.R -\name{grucl_gru_cell} -\alias{grucl_gru_cell} -\alias{gru_cell} +\name{morie_grucl_gru_cell} +\alias{morie_grucl_gru_cell} +\alias{morie_gru_cell} \title{GRU cell forward pass} \usage{ -grucl_gru_cell( +morie_grucl_gru_cell( x, h_prev = NULL, W = NULL, @@ -16,7 +16,7 @@ grucl_gru_cell( deterministic_seed = NULL ) -gru_cell( +morie_gru_cell( x, h_prev = NULL, W = NULL, @@ -55,10 +55,7 @@ R parity for \code{morie.fn.grucl.gru_cell}. Gates stacked as h = (1 - z) \odot n + z \odot h_\text{prev}} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_grucl_gru_cell(x = rnorm(50)) } \references{ Cho et al. (2014), EMNLP. diff --git a/r-package/morie/man/gxe_interaction_model.Rd b/r-package/morie/man/morie_gxe_interaction_model.Rd similarity index 71% rename from r-package/morie/man/gxe_interaction_model.Rd rename to r-package/morie/man/morie_gxe_interaction_model.Rd index bd815cece5..2b1fbf4bf6 100644 --- a/r-package/morie/man/gxe_interaction_model.Rd +++ b/r-package/morie/man/morie_gxe_interaction_model.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/gxemd.R -\name{gxe_interaction_model} -\alias{gxe_interaction_model} +\name{morie_gxe_interaction_model} +\alias{morie_gxe_interaction_model} \title{Two-way GxE ANOVA with EMS variance components} \usage{ -gxe_interaction_model(x, y, env) +morie_gxe_interaction_model(x, y, env) } \arguments{ \item{x}{Genotype IDs (length n).} @@ -20,10 +20,8 @@ list(estimate, g, e, ge, var_g, var_e, var_ge, var_eps, se, n, method). Two-way GxE ANOVA with EMS variance components } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Montesinos Lopez Ch 11. diff --git a/r-package/morie/man/morie_hawkes_fit.Rd b/r-package/morie/man/morie_hawkes_fit.Rd index 5c57e23441..eb8cdcebe5 100644 --- a/r-package/morie/man/morie_hawkes_fit.Rd +++ b/r-package/morie/man/morie_hawkes_fit.Rd @@ -37,10 +37,8 @@ core (the same kernels the Python package uses); without a compiled core it falls back to a pure-R \eqn{O(n^2)} likelihood. } \examples{ -\dontrun{ - set.seed(1) - ev <- cumsum(rexp(200, rate = 2)) - fit <- morie_hawkes_fit(ev, kernel = "exponential") - print(fit) -} +set.seed(1) +ev <- cumsum(rexp(200, rate = 2)) +fit <- morie_hawkes_fit(ev, kernel = "exponential") +print(fit) } diff --git a/r-package/morie/man/morie_hedges_g.Rd b/r-package/morie/man/morie_hedges_g.Rd new file mode 100644 index 0000000000..e2395fa7f2 --- /dev/null +++ b/r-package/morie/man/morie_hedges_g.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_hedges_g} +\alias{morie_hedges_g} +\title{Hedges' g (bias-corrected Cohen's d)} +\usage{ +morie_hedges_g(x1, x2) +} +\arguments{ +\item{x1}{Numeric vector (group 1).} + +\item{x2}{Numeric vector (group 2).} +} +\value{ +Numeric Hedges' g. +} +\description{ +Hedges' g (bias-corrected Cohen's d) +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/heinz_he_initialization.Rd b/r-package/morie/man/morie_heinz_he_initialization.Rd similarity index 81% rename from r-package/morie/man/heinz_he_initialization.Rd rename to r-package/morie/man/morie_heinz_he_initialization.Rd index 9077adf8e6..2d751158b3 100644 --- a/r-package/morie/man/heinz_he_initialization.Rd +++ b/r-package/morie/man/morie_heinz_he_initialization.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/heinz.R -\name{heinz_he_initialization} -\alias{heinz_he_initialization} -\alias{he_initialization} +\name{morie_heinz_he_initialization} +\alias{morie_heinz_he_initialization} +\alias{morie_he_initialization} \title{He / Kaiming weight initialization} \usage{ -heinz_he_initialization( +morie_heinz_he_initialization( fan_in, fan_out = NULL, seed = 42L, @@ -13,7 +13,7 @@ heinz_he_initialization( deterministic_seed = NULL ) -he_initialization( +morie_he_initialization( fan_in, fan_out = NULL, seed = 42L, @@ -46,10 +46,8 @@ R parity for \code{morie.fn.heinz.he_initialization}. \deqn{W \sim \mathcal{N}\!\left(0, \tfrac{2}{n_{in}}\right)} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ He, Zhang, Ren & Sun (2015), ICCV. diff --git a/r-package/morie/man/hurst_r.Rd b/r-package/morie/man/morie_hurst_r.Rd similarity index 83% rename from r-package/morie/man/hurst_r.Rd rename to r-package/morie/man/morie_hurst_r.Rd index 4c3ec7b394..9107cd67e8 100644 --- a/r-package/morie/man/hurst_r.Rd +++ b/r-package/morie/man/morie_hurst_r.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/signal.R -\name{hurst_r} -\alias{hurst_r} +\name{morie_hurst_r} +\alias{morie_hurst_r} \title{Hurst exponent via rescaled-range (R/S) analysis} \usage{ -hurst_r(x) +morie_hurst_r(x) } \arguments{ \item{x}{Numeric vector (time series).} @@ -23,8 +23,8 @@ series. \eqn{H = 0.5} indicates uncorrelated (Brownian) increments; \donttest{ if (requireNamespace("pracma", quietly = TRUE)) { set.seed(1) - x <- cumsum(rnorm(2048)) # Brownian motion, expected H ~ 0.5 - res <- hurst_r(x) + x <- cumsum(rnorm(2048)) # Brownian motion, expected H ~ 0.5 + res <- morie_hurst_r(x) res$interpretation } } diff --git a/r-package/morie/man/infer_measurement_level.Rd b/r-package/morie/man/morie_infer_measurement_level.Rd similarity index 65% rename from r-package/morie/man/infer_measurement_level.Rd rename to r-package/morie/man/morie_infer_measurement_level.Rd index cd7c42eb3e..d25132d3bb 100644 --- a/r-package/morie/man/infer_measurement_level.Rd +++ b/r-package/morie/man/morie_infer_measurement_level.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dataset_profile.R -\name{infer_measurement_level} -\alias{infer_measurement_level} +\name{morie_infer_measurement_level} +\alias{morie_infer_measurement_level} \title{Infer the measurement level of a vector} \usage{ -infer_measurement_level(x) +morie_infer_measurement_level(x) } \arguments{ \item{x}{A vector (any atomic type or factor).} @@ -24,9 +24,9 @@ Rules: logical or 2-level factor/character -> \code{"binary"}; ordered factor -> with non-negative range -> \code{"ratio"}; otherwise -> \code{"interval"}. } \examples{ -infer_measurement_level(c(0, 1, 1, 0)) # "binary" -infer_measurement_level(factor(c("a", "b", "c"))) # "nominal" -infer_measurement_level(ordered(c("low", "med", "high"))) # "ordinal" -infer_measurement_level(c(1.2, 3.4, 5.6)) # "ratio" -infer_measurement_level(c(-1.5, 0.0, 2.3)) # "interval" +morie_infer_measurement_level(c(0, 1, 1, 0)) # "binary" +morie_infer_measurement_level(factor(c("a", "b", "c"))) # "nominal" +morie_infer_measurement_level(ordered(c("low", "med", "high"))) # "ordinal" +morie_infer_measurement_level(c(1.2, 3.4, 5.6)) # "ratio" +morie_infer_measurement_level(c(-1.5, 0.0, 2.3)) # "interval" } diff --git a/r-package/morie/man/inspect_output.Rd b/r-package/morie/man/morie_inspect_output.Rd similarity index 88% rename from r-package/morie/man/inspect_output.Rd rename to r-package/morie/man/morie_inspect_output.Rd index edad67cdf3..b5dd4e2b01 100644 --- a/r-package/morie/man/inspect_output.Rd +++ b/r-package/morie/man/morie_inspect_output.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/inspector.R -\name{inspect_output} -\alias{inspect_output} +\name{morie_inspect_output} +\alias{morie_inspect_output} \title{Inspect a serialised analysis output (JSON, CSV, or RDS)} \usage{ -inspect_output(path) +morie_inspect_output(path) } \arguments{ \item{path}{Path to a JSON, CSV, or RDS file.} @@ -25,7 +25,7 @@ Supported formats: \code{.json} (via \code{jsonlite}), \code{.csv} (via base tmp <- tempfile(fileext = ".json") if (requireNamespace("jsonlite", quietly = TRUE)) { jsonlite::write_json(list(estimate = 0.123, se = 0.045), tmp) - inspect_output(tmp) + morie_inspect_output(tmp) unlink(tmp) } } diff --git a/r-package/morie/man/is_over_legal_limit.Rd b/r-package/morie/man/morie_is_over_legal_limit.Rd similarity index 62% rename from r-package/morie/man/is_over_legal_limit.Rd rename to r-package/morie/man/morie_is_over_legal_limit.Rd index 3b5c0be6ee..1083f2140f 100644 --- a/r-package/morie/man/is_over_legal_limit.Rd +++ b/r-package/morie/man/morie_is_over_legal_limit.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ebac.R -\name{is_over_legal_limit} -\alias{is_over_legal_limit} +\name{morie_is_over_legal_limit} +\alias{morie_is_over_legal_limit} \title{Test whether an eBAC exceeds a legal driving limit} \usage{ -is_over_legal_limit(ebac, limit = 0.08) +morie_is_over_legal_limit(ebac, limit = 0.08) } \arguments{ -\item{ebac}{Numeric eBAC value (e.g. from \code{\link[=calculate_ebac]{calculate_ebac()}}).} +\item{ebac}{Numeric eBAC value (e.g. from \code{\link[=morie_calculate_ebac]{morie_calculate_ebac()}}).} \item{limit}{Legal threshold (default 0.08, the per-se limit in most Canadian and US jurisdictions).} @@ -20,6 +20,6 @@ logical, to match the Python sibling and ease binary-outcome modelling.) Test whether an eBAC exceeds a legal driving limit } \examples{ -is_over_legal_limit(0.09) -is_over_legal_limit(0.05, limit = 0.05) +morie_is_over_legal_limit(0.09) +morie_is_over_legal_limit(0.05, limit = 0.05) } diff --git a/r-package/morie/man/morie_jackknife_estimate.Rd b/r-package/morie/man/morie_jackknife_estimate.Rd new file mode 100644 index 0000000000..65144a3bc2 --- /dev/null +++ b/r-package/morie/man/morie_jackknife_estimate.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampling.R +\name{morie_jackknife_estimate} +\alias{morie_jackknife_estimate} +\title{Delete-1 jackknife variance estimate} +\usage{ +morie_jackknife_estimate(df, statistic) +} +\arguments{ +\item{df}{A data frame.} + +\item{statistic}{A function taking a data frame and returning a scalar.} +} +\value{ +Named list: \code{estimate}, \code{se}, \code{bias}. +} +\description{ +Delete-1 jackknife variance estimate +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/johansen_cointegration.Rd b/r-package/morie/man/morie_johansen_cointegration.Rd similarity index 68% rename from r-package/morie/man/johansen_cointegration.Rd rename to r-package/morie/man/morie_johansen_cointegration.Rd index 9923209632..499918a6a3 100644 --- a/r-package/morie/man/johansen_cointegration.Rd +++ b/r-package/morie/man/morie_johansen_cointegration.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/johsn.R -\name{johansen_cointegration} -\alias{johansen_cointegration} +\name{morie_johansen_cointegration} +\alias{morie_johansen_cointegration} \title{Johansen trace test for cointegration} \usage{ -johansen_cointegration(x, k_ar_diff = 1) +morie_johansen_cointegration(x, k_ar_diff = 1) } \arguments{ \item{x}{Numeric matrix (T x k) of I(1) candidate series.} @@ -19,8 +19,6 @@ Named list with \code{eigenvalues, trace_stat, crit_values, Johansen trace test for cointegration } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/kalman_filter.Rd b/r-package/morie/man/morie_kalman_filter.Rd similarity index 71% rename from r-package/morie/man/kalman_filter.Rd rename to r-package/morie/man/morie_kalman_filter.Rd index 479f2e5725..fa942ac9ee 100644 --- a/r-package/morie/man/kalman_filter.Rd +++ b/r-package/morie/man/morie_kalman_filter.Rd @@ -1,15 +1,23 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/kalmn.R -\name{kalman_filter} -\alias{kalman_filter} +\name{morie_kalman_filter} +\alias{morie_kalman_filter} \title{Kalman filter predict-update for a linear-Gaussian state-space model} \usage{ -kalman_filter(x, F = NULL, H = NULL, Q = NULL, R = NULL, x0 = NULL, P0 = NULL) +morie_kalman_filter( + x, + transition = NULL, + H = NULL, + Q = NULL, + R = NULL, + x0 = NULL, + P0 = NULL +) } \arguments{ \item{x}{Numeric vector or matrix of observations.} -\item{F}{Transition matrix (default identity).} +\item{transition}{Transition matrix (default identity).} \item{H}{Observation matrix (default identity).} @@ -29,8 +37,5 @@ Named list with \code{state, state_cov, innovations, Defaults to a univariate local-level model when matrices are omitted. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_kalman_filter(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_kendall_tau.Rd b/r-package/morie/man/morie_kendall_tau.Rd new file mode 100644 index 0000000000..1d753c423f --- /dev/null +++ b/r-package/morie/man/morie_kendall_tau.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_kendall_tau} +\alias{morie_kendall_tau} +\title{Kendall's tau-b} +\usage{ +morie_kendall_tau(x, y) +} +\arguments{ +\item{x}{Numeric vector.} + +\item{y}{Numeric vector.} +} +\value{ +Named list: \code{tau}, \code{p_value}. +} +\description{ +Kendall's tau-b +} +\examples{ +morie_kendall_tau(x = rnorm(50), y = rnorm(50)) +} diff --git a/r-package/morie/man/kendall_tau_partial.Rd b/r-package/morie/man/morie_kendall_tau_partial.Rd similarity index 69% rename from r-package/morie/man/kendall_tau_partial.Rd rename to r-package/morie/man/morie_kendall_tau_partial.Rd index bcac1a1b25..771681fb8b 100644 --- a/r-package/morie/man/kendall_tau_partial.Rd +++ b/r-package/morie/man/morie_kendall_tau_partial.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ktaup.R -\name{kendall_tau_partial} -\alias{kendall_tau_partial} +\name{morie_kendall_tau_partial} +\alias{morie_kendall_tau_partial} \title{Kendall partial-tau correlation (Gibbons Ch 12.6)} \usage{ -kendall_tau_partial(x, y, z) +morie_kendall_tau_partial(x, y, z) } \arguments{ \item{x, y, z}{Numeric vectors of equal length.} @@ -18,8 +18,5 @@ tau_xy.z = (tau_xy - tau_xz tau_yz) / sqrt((1 - tau_xz^2)(1 - tau_yz^2)) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_kendall_tau_partial(x = rnorm(50), y = rnorm(50), z = rnorm(50)) } diff --git a/r-package/morie/man/kmeans_clustering.Rd b/r-package/morie/man/morie_kmeans_clustering.Rd similarity index 69% rename from r-package/morie/man/kmeans_clustering.Rd rename to r-package/morie/man/morie_kmeans_clustering.Rd index 20b1a44d40..a73f6fb6ac 100644 --- a/r-package/morie/man/kmeans_clustering.Rd +++ b/r-package/morie/man/morie_kmeans_clustering.Rd @@ -1,10 +1,16 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/kmnsc.R -\name{kmeans_clustering} -\alias{kmeans_clustering} +\name{morie_kmeans_clustering} +\alias{morie_kmeans_clustering} \title{K-means clustering (R parity)} \usage{ -kmeans_clustering(x, n_clusters = 3L, n_init = 10L, max_iter = 300L, seed = 0L) +morie_kmeans_clustering( + x, + n_clusters = 3L, + n_init = 10L, + max_iter = 300L, + seed = 0L +) } \arguments{ \item{x}{Numeric matrix.} @@ -25,8 +31,5 @@ n_iter, n_clusters, n, method. Wraps \code{stats::kmeans} with Hartigan-Wong (the default). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_kmeans_clustering(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_kruskal_wallis_test.Rd b/r-package/morie/man/morie_kruskal_wallis_test.Rd new file mode 100644 index 0000000000..13cb6af9b0 --- /dev/null +++ b/r-package/morie/man/morie_kruskal_wallis_test.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_kruskal_wallis_test} +\alias{morie_kruskal_wallis_test} +\title{Kruskal-Wallis non-parametric ANOVA} +\usage{ +morie_kruskal_wallis_test(...) +} +\arguments{ +\item{...}{Numeric vectors, one per group.} +} +\value{ +Named list: \code{H}, \code{df}, \code{p_value}. +} +\description{ +Kruskal-Wallis non-parametric ANOVA +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/ksr01_kosorok_empirical_process.Rd b/r-package/morie/man/morie_ksr01_kosorok_empirical_process.Rd similarity index 66% rename from r-package/morie/man/ksr01_kosorok_empirical_process.Rd rename to r-package/morie/man/morie_ksr01_kosorok_empirical_process.Rd index 468a5bd53f..aabf0d312e 100644 --- a/r-package/morie/man/ksr01_kosorok_empirical_process.Rd +++ b/r-package/morie/man/morie_ksr01_kosorok_empirical_process.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr01.R -\name{ksr01_kosorok_empirical_process} -\alias{ksr01_kosorok_empirical_process} -\alias{kosorok_empirical_process} +\name{morie_ksr01_kosorok_empirical_process} +\alias{morie_ksr01_kosorok_empirical_process} +\alias{morie_kosorok_empirical_process} \title{Empirical process indexed by a function class} \usage{ -ksr01_kosorok_empirical_process(x, f = NULL, mu0 = 0) +morie_ksr01_kosorok_empirical_process(x, f = NULL, mu0 = 0) -kosorok_empirical_process(x, f = NULL, mu0 = 0) +morie_kosorok_empirical_process(x, f = NULL, mu0 = 0) } \arguments{ \item{x}{Numeric vector of IID observations.} @@ -25,10 +25,7 @@ statistic sqrt(n) * (P_n(f) - mu0) plus the empirical L2(P_n) standard deviation sigma_n(f). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr01_kosorok_empirical_process(x = rnorm(50)) } \references{ Kosorok (2008), Ch 2. diff --git a/r-package/morie/man/ksr02_kosorok_donsker_class.Rd b/r-package/morie/man/morie_ksr02_kosorok_donsker_class.Rd similarity index 53% rename from r-package/morie/man/ksr02_kosorok_donsker_class.Rd rename to r-package/morie/man/morie_ksr02_kosorok_donsker_class.Rd index 206c834758..2a1a162da3 100644 --- a/r-package/morie/man/ksr02_kosorok_donsker_class.Rd +++ b/r-package/morie/man/morie_ksr02_kosorok_donsker_class.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr02.R -\name{ksr02_kosorok_donsker_class} -\alias{ksr02_kosorok_donsker_class} -\alias{kosorok_donsker_class} +\name{morie_ksr02_kosorok_donsker_class} +\alias{morie_ksr02_kosorok_donsker_class} +\alias{morie_kosorok_donsker_class} \title{Donsker-class verification via bracketing integral} \usage{ -ksr02_kosorok_donsker_class(x) +morie_ksr02_kosorok_donsker_class(x) -kosorok_donsker_class(x) +morie_kosorok_donsker_class(x) } \arguments{ \item{x}{Numeric vector (unused, kept for API parity).} @@ -17,13 +17,11 @@ Named list with estimate, n, method. } \description{ Computes J_[](1, F, L_2(P)) = int_0^1 sqrt(log N_brackets(e, F, L_2(P))) de -for the indicator class F of one-sided thresholds on X (Kosorok Ex 2.5.4), with bracketing number bounded by 2 over epsilon squared. +for the indicator class F of one-sided thresholds on X +(Kosorok Ex 2.5.4), with bracketing number bounded by 2 / epsilon^2. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr02_kosorok_donsker_class(x = rnorm(50)) } \references{ Kosorok (2008), Ch 2 (Theorem 2.5.2). diff --git a/r-package/morie/man/ksr03_kosorok_glivenko_cantelli.Rd b/r-package/morie/man/morie_ksr03_kosorok_glivenko_cantelli.Rd similarity index 62% rename from r-package/morie/man/ksr03_kosorok_glivenko_cantelli.Rd rename to r-package/morie/man/morie_ksr03_kosorok_glivenko_cantelli.Rd index c6b5aca09a..db0a7dfc5f 100644 --- a/r-package/morie/man/ksr03_kosorok_glivenko_cantelli.Rd +++ b/r-package/morie/man/morie_ksr03_kosorok_glivenko_cantelli.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr03.R -\name{ksr03_kosorok_glivenko_cantelli} -\alias{ksr03_kosorok_glivenko_cantelli} -\alias{kosorok_glivenko_cantelli} +\name{morie_ksr03_kosorok_glivenko_cantelli} +\alias{morie_ksr03_kosorok_glivenko_cantelli} +\alias{morie_kosorok_glivenko_cantelli} \title{Glivenko-Cantelli theorem verification (one-sample KS)} \usage{ -ksr03_kosorok_glivenko_cantelli(x, cdf = "pnorm") +morie_ksr03_kosorok_glivenko_cantelli(x, cdf = "pnorm") -kosorok_glivenko_cantelli(x, cdf = "pnorm") +morie_kosorok_glivenko_cantelli(x, cdf = "pnorm") } \arguments{ \item{x}{Numeric vector.} @@ -22,10 +22,7 @@ sup_t |F_n(t) - F(t)| under a hypothesised CDF F. By Glivenko-Cantelli the statistic -> 0 a.s. when F is correct. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr03_kosorok_glivenko_cantelli(x = rnorm(50)) } \references{ Kosorok (2008), Ch 2. diff --git a/r-package/morie/man/ksr04_kosorok_vc_dimension.Rd b/r-package/morie/man/morie_ksr04_kosorok_vc_dimension.Rd similarity index 64% rename from r-package/morie/man/ksr04_kosorok_vc_dimension.Rd rename to r-package/morie/man/morie_ksr04_kosorok_vc_dimension.Rd index a6adae755a..acbd23bd80 100644 --- a/r-package/morie/man/ksr04_kosorok_vc_dimension.Rd +++ b/r-package/morie/man/morie_ksr04_kosorok_vc_dimension.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr04.R -\name{ksr04_kosorok_vc_dimension} -\alias{ksr04_kosorok_vc_dimension} -\alias{kosorok_vc_dimension} +\name{morie_ksr04_kosorok_vc_dimension} +\alias{morie_ksr04_kosorok_vc_dimension} +\alias{morie_kosorok_vc_dimension} \title{VC dimension for affine half-spaces in R^d} \usage{ -ksr04_kosorok_vc_dimension(x) +morie_ksr04_kosorok_vc_dimension(x) -kosorok_vc_dimension(x) +morie_kosorok_vc_dimension(x) } \arguments{ \item{x}{Numeric matrix or vector; d is its number of columns.} @@ -19,10 +19,7 @@ Named list with estimate, n, method. VC dimension of the linear half-space classifier in R^d equals d+1. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr04_kosorok_vc_dimension(x = rnorm(50)) } \references{ Kosorok (2008), Ch 2; Vapnik & Chervonenkis (1971). diff --git a/r-package/morie/man/ksr05_kosorok_bracketing_number.Rd b/r-package/morie/man/morie_ksr05_kosorok_bracketing_number.Rd similarity index 60% rename from r-package/morie/man/ksr05_kosorok_bracketing_number.Rd rename to r-package/morie/man/morie_ksr05_kosorok_bracketing_number.Rd index 5acdb01b6c..3834b45a53 100644 --- a/r-package/morie/man/ksr05_kosorok_bracketing_number.Rd +++ b/r-package/morie/man/morie_ksr05_kosorok_bracketing_number.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr05.R -\name{ksr05_kosorok_bracketing_number} -\alias{ksr05_kosorok_bracketing_number} -\alias{kosorok_bracketing_number} +\name{morie_ksr05_kosorok_bracketing_number} +\alias{morie_ksr05_kosorok_bracketing_number} +\alias{morie_kosorok_bracketing_number} \title{Bracketing number for the indicator class} \usage{ -ksr05_kosorok_bracketing_number(x, e = 0.1) +morie_ksr05_kosorok_bracketing_number(x, e = 0.1) -kosorok_bracketing_number(x, e = 0.1) +morie_kosorok_bracketing_number(x, e = 0.1) } \arguments{ \item{x}{Numeric vector (used only for n).} @@ -21,10 +21,7 @@ Named list with estimate, n, method. "bracketing number for one-sided threshold class" } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr05_kosorok_bracketing_number(x = rnorm(50)) } \references{ Kosorok (2008), Ch 2. diff --git a/r-package/morie/man/ksr06_kosorok_maximal_inequality.Rd b/r-package/morie/man/morie_ksr06_kosorok_maximal_inequality.Rd similarity index 60% rename from r-package/morie/man/ksr06_kosorok_maximal_inequality.Rd rename to r-package/morie/man/morie_ksr06_kosorok_maximal_inequality.Rd index 0829102a00..588fae0588 100644 --- a/r-package/morie/man/ksr06_kosorok_maximal_inequality.Rd +++ b/r-package/morie/man/morie_ksr06_kosorok_maximal_inequality.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr06.R -\name{ksr06_kosorok_maximal_inequality} -\alias{ksr06_kosorok_maximal_inequality} -\alias{kosorok_maximal_inequality} +\name{morie_ksr06_kosorok_maximal_inequality} +\alias{morie_ksr06_kosorok_maximal_inequality} +\alias{morie_kosorok_maximal_inequality} \title{Maximal inequality bound for empirical processes} \usage{ -ksr06_kosorok_maximal_inequality(x) +morie_ksr06_kosorok_maximal_inequality(x) -kosorok_maximal_inequality(x) +morie_kosorok_maximal_inequality(x) } \arguments{ \item{x}{Numeric vector.} @@ -20,10 +20,7 @@ E* of sup |G_n(f)| <= J_bracket(theta_n, F, L_2(P)) * sigma_n for the indicator class with theta_n = 0.5. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr06_kosorok_maximal_inequality(x = rnorm(50)) } \references{ Kosorok (2008), Ch 2. diff --git a/r-package/morie/man/ksr07_kosorok_bootstrap_empirical.Rd b/r-package/morie/man/morie_ksr07_kosorok_bootstrap_empirical.Rd similarity index 72% rename from r-package/morie/man/ksr07_kosorok_bootstrap_empirical.Rd rename to r-package/morie/man/morie_ksr07_kosorok_bootstrap_empirical.Rd index e9eda5a0bf..9e32536195 100644 --- a/r-package/morie/man/ksr07_kosorok_bootstrap_empirical.Rd +++ b/r-package/morie/man/morie_ksr07_kosorok_bootstrap_empirical.Rd @@ -1,18 +1,23 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr07.R -\name{ksr07_kosorok_bootstrap_empirical} -\alias{ksr07_kosorok_bootstrap_empirical} -\alias{kosorok_bootstrap_empirical} +\name{morie_ksr07_kosorok_bootstrap_empirical} +\alias{morie_ksr07_kosorok_bootstrap_empirical} +\alias{morie_kosorok_bootstrap_empirical} \title{Bootstrap consistency for the empirical process} \usage{ -ksr07_kosorok_bootstrap_empirical( +morie_ksr07_kosorok_bootstrap_empirical( x, B = 1000, seed = 0, deterministic_seed = NULL ) -kosorok_bootstrap_empirical(x, B = 1000, seed = 0, deterministic_seed = NULL) +morie_kosorok_bootstrap_empirical( + x, + B = 1000, + seed = 0, + deterministic_seed = NULL +) } \arguments{ \item{x}{Numeric vector.} @@ -34,10 +39,7 @@ G_n^\emph{(f) = sqrt(n)(P_n^} - P_n)(f). Monte-Carlo bootstrap of the sample mean; returns mean/SD of G_n^* across B replications. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr07_kosorok_bootstrap_empirical(x = rnorm(50)) } \references{ Kosorok (2008), Ch 10. diff --git a/r-package/morie/man/ksr08_kosorok_multiplier_bootstrap.Rd b/r-package/morie/man/morie_ksr08_kosorok_multiplier_bootstrap.Rd similarity index 71% rename from r-package/morie/man/ksr08_kosorok_multiplier_bootstrap.Rd rename to r-package/morie/man/morie_ksr08_kosorok_multiplier_bootstrap.Rd index e6e8f85e2f..e36217eb76 100644 --- a/r-package/morie/man/ksr08_kosorok_multiplier_bootstrap.Rd +++ b/r-package/morie/man/morie_ksr08_kosorok_multiplier_bootstrap.Rd @@ -1,18 +1,23 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr08.R -\name{ksr08_kosorok_multiplier_bootstrap} -\alias{ksr08_kosorok_multiplier_bootstrap} -\alias{kosorok_multiplier_bootstrap} +\name{morie_ksr08_kosorok_multiplier_bootstrap} +\alias{morie_ksr08_kosorok_multiplier_bootstrap} +\alias{morie_kosorok_multiplier_bootstrap} \title{Gaussian multiplier bootstrap for Z-estimators} \usage{ -ksr08_kosorok_multiplier_bootstrap( +morie_ksr08_kosorok_multiplier_bootstrap( x, B = 1000, seed = 0, deterministic_seed = NULL ) -kosorok_multiplier_bootstrap(x, B = 1000, seed = 0, deterministic_seed = NULL) +morie_kosorok_multiplier_bootstrap( + x, + B = 1000, + seed = 0, + deterministic_seed = NULL +) } \arguments{ \item{x}{Numeric vector.} @@ -33,10 +38,7 @@ Named list with estimate, se, n, method. G_n_xi(f) = n raised to the power of -1/2 times sum_i xi_i (f(X_i) - P_n f), xi ~ N(0,1). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr08_kosorok_multiplier_bootstrap(x = rnorm(50)) } \references{ Kosorok (2008), Ch 10. diff --git a/r-package/morie/man/ksr09_kosorok_z_estimator.Rd b/r-package/morie/man/morie_ksr09_kosorok_z_estimator.Rd similarity index 65% rename from r-package/morie/man/ksr09_kosorok_z_estimator.Rd rename to r-package/morie/man/morie_ksr09_kosorok_z_estimator.Rd index 1d2aa31cbc..6d8c278936 100644 --- a/r-package/morie/man/ksr09_kosorok_z_estimator.Rd +++ b/r-package/morie/man/morie_ksr09_kosorok_z_estimator.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr09.R -\name{ksr09_kosorok_z_estimator} -\alias{ksr09_kosorok_z_estimator} -\alias{kosorok_z_estimator} +\name{morie_ksr09_kosorok_z_estimator} +\alias{morie_ksr09_kosorok_z_estimator} +\alias{morie_kosorok_z_estimator} \title{Z-estimator asymptotic distribution} \usage{ -ksr09_kosorok_z_estimator(x, y = NULL) +morie_ksr09_kosorok_z_estimator(x, y = NULL) -kosorok_z_estimator(x, y = NULL) +morie_kosorok_z_estimator(x, y = NULL) } \arguments{ \item{x}{Numeric vector.} @@ -22,10 +22,7 @@ theta_n solves P_n psi(.; theta) = 0 with sandwich variance V = solve(A) \%\emph{\% B \%}\% t(solve(A)). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr09_kosorok_z_estimator(x = rnorm(50)) } \references{ Kosorok (2008), Ch 5. diff --git a/r-package/morie/man/ksr10_kosorok_m_estimator.Rd b/r-package/morie/man/morie_ksr10_kosorok_m_estimator.Rd similarity index 65% rename from r-package/morie/man/ksr10_kosorok_m_estimator.Rd rename to r-package/morie/man/morie_ksr10_kosorok_m_estimator.Rd index 4e1f89b2b0..1ffc9e9817 100644 --- a/r-package/morie/man/ksr10_kosorok_m_estimator.Rd +++ b/r-package/morie/man/morie_ksr10_kosorok_m_estimator.Rd @@ -1,13 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr10.R -\name{ksr10_kosorok_m_estimator} -\alias{ksr10_kosorok_m_estimator} -\alias{kosorok_m_estimator} +\name{morie_ksr10_kosorok_m_estimator} +\alias{morie_ksr10_kosorok_m_estimator} +\alias{morie_kosorok_m_estimator} \title{Huber-M location estimator with profiled scale} \usage{ -ksr10_kosorok_m_estimator(x, y = NULL, k = 1.345, max_iter = 100, tol = 1e-10) +morie_ksr10_kosorok_m_estimator( + x, + y = NULL, + k = 1.345, + max_iter = 100, + tol = 1e-10 +) -kosorok_m_estimator(x, y = NULL, k = 1.345, max_iter = 100, tol = 1e-10) +morie_kosorok_m_estimator(x, y = NULL, k = 1.345, max_iter = 100, tol = 1e-10) } \arguments{ \item{x}{Numeric vector.} @@ -27,10 +33,7 @@ Named list with estimate, se, n, method. theta_n = argmax_theta P_n m(.; theta, eta_n), eta_n = MAD/0.6745. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr10_kosorok_m_estimator(x = rnorm(50)) } \references{ Kosorok (2008), Ch 5; Huber (1981). diff --git a/r-package/morie/man/ksr11_kosorok_efficient_score.Rd b/r-package/morie/man/morie_ksr11_kosorok_efficient_score.Rd similarity index 63% rename from r-package/morie/man/ksr11_kosorok_efficient_score.Rd rename to r-package/morie/man/morie_ksr11_kosorok_efficient_score.Rd index 4e2fe4e6ff..574ef4c14b 100644 --- a/r-package/morie/man/ksr11_kosorok_efficient_score.Rd +++ b/r-package/morie/man/morie_ksr11_kosorok_efficient_score.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr11.R -\name{ksr11_kosorok_efficient_score} -\alias{ksr11_kosorok_efficient_score} -\alias{kosorok_efficient_score} +\name{morie_ksr11_kosorok_efficient_score} +\alias{morie_ksr11_kosorok_efficient_score} +\alias{morie_kosorok_efficient_score} \title{Efficient score for OLS beta in the linear model} \usage{ -ksr11_kosorok_efficient_score(x, y) +morie_ksr11_kosorok_efficient_score(x, y) -kosorok_efficient_score(x, y) +morie_kosorok_efficient_score(x, y) } \arguments{ \item{x}{Numeric covariate vector.} @@ -22,10 +22,7 @@ se (residual sd), n, method. S_eff(X, Y) = (Y - E(Y|X))(X - E(X)) / sigma^2. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr11_kosorok_efficient_score(x = rnorm(50), y = rnorm(50)) } \references{ Kosorok (2008), Ch 6. diff --git a/r-package/morie/man/ksr12_kosorok_information_bound.Rd b/r-package/morie/man/morie_ksr12_kosorok_information_bound.Rd similarity index 63% rename from r-package/morie/man/ksr12_kosorok_information_bound.Rd rename to r-package/morie/man/morie_ksr12_kosorok_information_bound.Rd index 3bcef8a1f6..d72706a4a7 100644 --- a/r-package/morie/man/ksr12_kosorok_information_bound.Rd +++ b/r-package/morie/man/morie_ksr12_kosorok_information_bound.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr12.R -\name{ksr12_kosorok_information_bound} -\alias{ksr12_kosorok_information_bound} -\alias{kosorok_information_bound} +\name{morie_ksr12_kosorok_information_bound} +\alias{morie_ksr12_kosorok_information_bound} +\alias{morie_kosorok_information_bound} \title{Semiparametric information bound I_eff = Var(X)/sigma^2} \usage{ -ksr12_kosorok_information_bound(x, y) +morie_ksr12_kosorok_information_bound(x, y) -kosorok_information_bound(x, y) +morie_kosorok_information_bound(x, y) } \arguments{ \item{x}{Numeric covariate vector.} @@ -22,10 +22,7 @@ For the linear model Y = beta X + eps, this is the Fisher-info lower bound on Var(sqrt(n) beta_hat). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr12_kosorok_information_bound(x = rnorm(50), y = rnorm(50)) } \references{ Kosorok (2008), Ch 6. diff --git a/r-package/morie/man/ksr13_kosorok_tangent_space.Rd b/r-package/morie/man/morie_ksr13_kosorok_tangent_space.Rd similarity index 63% rename from r-package/morie/man/ksr13_kosorok_tangent_space.Rd rename to r-package/morie/man/morie_ksr13_kosorok_tangent_space.Rd index aacedbc989..e59f241d79 100644 --- a/r-package/morie/man/ksr13_kosorok_tangent_space.Rd +++ b/r-package/morie/man/morie_ksr13_kosorok_tangent_space.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr13.R -\name{ksr13_kosorok_tangent_space} -\alias{ksr13_kosorok_tangent_space} -\alias{kosorok_tangent_space} +\name{morie_ksr13_kosorok_tangent_space} +\alias{morie_ksr13_kosorok_tangent_space} +\alias{morie_kosorok_tangent_space} \title{Tangent-space dimension via empirical-score rank} \usage{ -ksr13_kosorok_tangent_space(x) +morie_ksr13_kosorok_tangent_space(x) -kosorok_tangent_space(x) +morie_kosorok_tangent_space(x) } \arguments{ \item{x}{Numeric vector.} @@ -20,10 +20,7 @@ Score basis (x - mean(x)) and (x^2 - mean(x^2)); returns rank of empirical Gram matrix at tolerance 1e-10. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr13_kosorok_tangent_space(x = rnorm(50)) } \references{ Kosorok (2008), Ch 6. diff --git a/r-package/morie/man/ksr14_kosorok_profile_likelihood.Rd b/r-package/morie/man/morie_ksr14_kosorok_profile_likelihood.Rd similarity index 63% rename from r-package/morie/man/ksr14_kosorok_profile_likelihood.Rd rename to r-package/morie/man/morie_ksr14_kosorok_profile_likelihood.Rd index 0afc3cf1f3..c8d3e3a296 100644 --- a/r-package/morie/man/ksr14_kosorok_profile_likelihood.Rd +++ b/r-package/morie/man/morie_ksr14_kosorok_profile_likelihood.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr14.R -\name{ksr14_kosorok_profile_likelihood} -\alias{ksr14_kosorok_profile_likelihood} -\alias{kosorok_profile_likelihood} +\name{morie_ksr14_kosorok_profile_likelihood} +\alias{morie_ksr14_kosorok_profile_likelihood} +\alias{morie_kosorok_profile_likelihood} \title{Profile likelihood for the linear-regression slope} \usage{ -ksr14_kosorok_profile_likelihood(x, y) +morie_ksr14_kosorok_profile_likelihood(x, y) -kosorok_profile_likelihood(x, y) +morie_kosorok_profile_likelihood(x, y) } \arguments{ \item{x}{Numeric covariate vector.} @@ -22,10 +22,7 @@ Y = beta X + eps with eps ~ N(0, sigma^2); profiling sigma^2 gives OLS slope with observed-information SE sqrt(sigma2/Sxx). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr14_kosorok_profile_likelihood(x = rnorm(50), y = rnorm(50)) } \references{ Kosorok (2008), Ch 7. diff --git a/r-package/morie/man/ksr15_kosorok_one_step_estimator.Rd b/r-package/morie/man/morie_ksr15_kosorok_one_step_estimator.Rd similarity index 58% rename from r-package/morie/man/ksr15_kosorok_one_step_estimator.Rd rename to r-package/morie/man/morie_ksr15_kosorok_one_step_estimator.Rd index f5be0efe30..edec47fbe1 100644 --- a/r-package/morie/man/ksr15_kosorok_one_step_estimator.Rd +++ b/r-package/morie/man/morie_ksr15_kosorok_one_step_estimator.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr15.R -\name{ksr15_kosorok_one_step_estimator} -\alias{ksr15_kosorok_one_step_estimator} -\alias{kosorok_one_step_estimator} +\name{morie_ksr15_kosorok_one_step_estimator} +\alias{morie_ksr15_kosorok_one_step_estimator} +\alias{morie_kosorok_one_step_estimator} \title{One-step efficient location estimator} \usage{ -ksr15_kosorok_one_step_estimator(x, y = NULL) +morie_ksr15_kosorok_one_step_estimator(x, y = NULL) -kosorok_one_step_estimator(x, y = NULL) +morie_kosorok_one_step_estimator(x, y = NULL) } \arguments{ \item{x}{Numeric vector.} @@ -21,10 +21,7 @@ Named list with estimate, se, n, method. theta_tilde = theta_init + (1/n) * sum IF(X_i; theta_init). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr15_kosorok_one_step_estimator(x = rnorm(50)) } \references{ Kosorok (2008), Ch 7. diff --git a/r-package/morie/man/ksr16_kosorok_influence_function.Rd b/r-package/morie/man/morie_ksr16_kosorok_influence_function.Rd similarity index 59% rename from r-package/morie/man/ksr16_kosorok_influence_function.Rd rename to r-package/morie/man/morie_ksr16_kosorok_influence_function.Rd index 003c8be976..2dcc1e773d 100644 --- a/r-package/morie/man/ksr16_kosorok_influence_function.Rd +++ b/r-package/morie/man/morie_ksr16_kosorok_influence_function.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr16.R -\name{ksr16_kosorok_influence_function} -\alias{ksr16_kosorok_influence_function} -\alias{kosorok_influence_function} +\name{morie_ksr16_kosorok_influence_function} +\alias{morie_ksr16_kosorok_influence_function} +\alias{morie_kosorok_influence_function} \title{Influence function of OLS beta-hat} \usage{ -ksr16_kosorok_influence_function(x, y) +morie_ksr16_kosorok_influence_function(x, y) -kosorok_influence_function(x, y) +morie_kosorok_influence_function(x, y) } \arguments{ \item{x}{Numeric covariate vector.} @@ -21,10 +21,7 @@ Named list with estimate, n, method. IF(x, y) = (y - ybar - beta_hat (x - xbar))(x - xbar) / Var(X). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr16_kosorok_influence_function(x = rnorm(50), y = rnorm(50)) } \references{ Kosorok (2008), Ch 7. diff --git a/r-package/morie/man/ksr17_kosorok_counting_process.Rd b/r-package/morie/man/morie_ksr17_kosorok_counting_process.Rd similarity index 60% rename from r-package/morie/man/ksr17_kosorok_counting_process.Rd rename to r-package/morie/man/morie_ksr17_kosorok_counting_process.Rd index be3d35f02b..e3ee40c4ae 100644 --- a/r-package/morie/man/ksr17_kosorok_counting_process.Rd +++ b/r-package/morie/man/morie_ksr17_kosorok_counting_process.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr17.R -\name{ksr17_kosorok_counting_process} -\alias{ksr17_kosorok_counting_process} -\alias{kosorok_counting_process} +\name{morie_ksr17_kosorok_counting_process} +\alias{morie_ksr17_kosorok_counting_process} +\alias{morie_kosorok_counting_process} \title{Counting process for survival data} \usage{ -ksr17_kosorok_counting_process(t, event) +morie_ksr17_kosorok_counting_process(t, event) -kosorok_counting_process(t, event) +morie_kosorok_counting_process(t, event) } \arguments{ \item{t}{Numeric vector of observed times.} @@ -21,10 +21,7 @@ Named list with estimate (total events), n, method. N(infty) = the count of events (sum of indicator delta_i = 1). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr17_kosorok_counting_process(t = seq(0, 1, length.out = 50), event = rbinom(50, 1, 0.8)) } \references{ Kosorok (2008), Ch 8. diff --git a/r-package/morie/man/ksr18_kosorok_nelson_aalen.Rd b/r-package/morie/man/morie_ksr18_kosorok_nelson_aalen.Rd similarity index 63% rename from r-package/morie/man/ksr18_kosorok_nelson_aalen.Rd rename to r-package/morie/man/morie_ksr18_kosorok_nelson_aalen.Rd index a92d1ccac1..bbf5f81480 100644 --- a/r-package/morie/man/ksr18_kosorok_nelson_aalen.Rd +++ b/r-package/morie/man/morie_ksr18_kosorok_nelson_aalen.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr18.R -\name{ksr18_kosorok_nelson_aalen} -\alias{ksr18_kosorok_nelson_aalen} -\alias{kosorok_nelson_aalen} +\name{morie_ksr18_kosorok_nelson_aalen} +\alias{morie_ksr18_kosorok_nelson_aalen} +\alias{morie_kosorok_nelson_aalen} \title{Nelson-Aalen cumulative hazard at the largest event time} \usage{ -ksr18_kosorok_nelson_aalen(t, event) +morie_ksr18_kosorok_nelson_aalen(t, event) -kosorok_nelson_aalen(t, event) +morie_kosorok_nelson_aalen(t, event) } \arguments{ \item{t}{Numeric vector of observed times.} @@ -21,10 +21,7 @@ Named list with estimate, se, n, method. Lambda_hat(t) = sum over t_i <= t of d_i / Y_i. Variance: sum d_i/Y_i^2. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr18_kosorok_nelson_aalen(t = seq(0, 1, length.out = 50), event = rbinom(50, 1, 0.8)) } \references{ Kosorok (2008), Ch 8. diff --git a/r-package/morie/man/ksr19_kosorok_cox_partial_likelihood.Rd b/r-package/morie/man/morie_ksr19_kosorok_cox_partial_likelihood.Rd similarity index 62% rename from r-package/morie/man/ksr19_kosorok_cox_partial_likelihood.Rd rename to r-package/morie/man/morie_ksr19_kosorok_cox_partial_likelihood.Rd index e8549c0172..3d1b919e63 100644 --- a/r-package/morie/man/ksr19_kosorok_cox_partial_likelihood.Rd +++ b/r-package/morie/man/morie_ksr19_kosorok_cox_partial_likelihood.Rd @@ -1,13 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr19.R -\name{ksr19_kosorok_cox_partial_likelihood} -\alias{ksr19_kosorok_cox_partial_likelihood} -\alias{kosorok_cox_partial_likelihood} +\name{morie_ksr19_kosorok_cox_partial_likelihood} +\alias{morie_ksr19_kosorok_cox_partial_likelihood} +\alias{morie_kosorok_cox_partial_likelihood} \title{Scalar-covariate Cox proportional-hazards partial-likelihood MLE} \usage{ -ksr19_kosorok_cox_partial_likelihood(x, t, event, tol = 1e-10, max_iter = 100) +morie_ksr19_kosorok_cox_partial_likelihood( + x, + t, + event, + tol = 1e-10, + max_iter = 100 +) -kosorok_cox_partial_likelihood(x, t, event, tol = 1e-10, max_iter = 100) +morie_kosorok_cox_partial_likelihood(x, t, event, tol = 1e-10, max_iter = 100) } \arguments{ \item{x}{Numeric covariate vector.} @@ -29,10 +35,10 @@ l(beta) = sum over events of (X_i beta - log sum over j in R(t_i) of exp(X_j bet Breslow tie handling. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr19_kosorok_cox_partial_likelihood( + x = rnorm(50), + t = seq(0, 1, length.out = 50), event = rbinom(50, 1, 0.8) +) } \references{ Kosorok (2008), Ch 8. diff --git a/r-package/morie/man/ksr20_kosorok_censoring_survival.Rd b/r-package/morie/man/morie_ksr20_kosorok_censoring_survival.Rd similarity index 59% rename from r-package/morie/man/ksr20_kosorok_censoring_survival.Rd rename to r-package/morie/man/morie_ksr20_kosorok_censoring_survival.Rd index dae75372a5..3294d67124 100644 --- a/r-package/morie/man/ksr20_kosorok_censoring_survival.Rd +++ b/r-package/morie/man/morie_ksr20_kosorok_censoring_survival.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ksr20.R -\name{ksr20_kosorok_censoring_survival} -\alias{ksr20_kosorok_censoring_survival} -\alias{kosorok_censoring_survival} +\name{morie_ksr20_kosorok_censoring_survival} +\alias{morie_ksr20_kosorok_censoring_survival} +\alias{morie_kosorok_censoring_survival} \title{Kaplan-Meier estimator of the censoring distribution} \usage{ -ksr20_kosorok_censoring_survival(t, event) +morie_ksr20_kosorok_censoring_survival(t, event) -kosorok_censoring_survival(t, event) +morie_kosorok_censoring_survival(t, event) } \arguments{ \item{t}{Numeric vector of observed times.} @@ -21,10 +21,7 @@ Named list with estimate, se, n, method. S_C(t) by KM on (t_i, 1 - delta_i). Greenwood SE. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_ksr20_kosorok_censoring_survival(t = seq(0, 1, length.out = 50), event = rbinom(50, 1, 0.8)) } \references{ Kosorok (2008), Ch 8. diff --git a/r-package/morie/man/learning_curve.Rd b/r-package/morie/man/morie_learning_curve.Rd similarity index 70% rename from r-package/morie/man/learning_curve.Rd rename to r-package/morie/man/morie_learning_curve.Rd index 9e963ce0e8..9b8587a8a3 100644 --- a/r-package/morie/man/learning_curve.Rd +++ b/r-package/morie/man/morie_learning_curve.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/lrcvg.R -\name{learning_curve} -\alias{learning_curve} +\name{morie_learning_curve} +\alias{morie_learning_curve} \title{Learning curve -- train / val MSE vs training-set size (R parity)} \usage{ -learning_curve(x, y, sizes = NULL, cv = 5L, seed = 0L) +morie_learning_curve(x, y, sizes = NULL, cv = 5L, seed = 0L) } \arguments{ \item{x}{Numeric matrix predictors.} @@ -22,13 +22,10 @@ Named list: estimate (final val MSE), train_sizes, train_scores, val_scores, n, method. } \description{ -Manual implementation of the sklearn learning_curve flow: shuffle, +Manual implementation of the sklearn morie_learning_curve flow: shuffle, split into k folds, for each train-fraction fit on a prefix of the training fold and score on the held-out fold. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_learning_curve(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/morie_levene_test.Rd b/r-package/morie/man/morie_levene_test.Rd new file mode 100644 index 0000000000..93efde16c2 --- /dev/null +++ b/r-package/morie/man/morie_levene_test.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_levene_test} +\alias{morie_levene_test} +\title{Levene test for equality of variances} +\usage{ +morie_levene_test(...) +} +\arguments{ +\item{...}{Numeric vectors, one per group.} +} +\value{ +Named list: \code{F}, \code{p_value}. +} +\description{ +Levene test for equality of variances +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_license_metadata.Rd b/r-package/morie/man/morie_license_metadata.Rd index f39c673734..2d2f4053dc 100644 --- a/r-package/morie/man/morie_license_metadata.Rd +++ b/r-package/morie/man/morie_license_metadata.Rd @@ -15,8 +15,5 @@ compliance pipelines. morie's SPDX-style licence metadata } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_license_metadata() } diff --git a/r-package/morie/man/linear_regression_ols.Rd b/r-package/morie/man/morie_linear_regression_ols.Rd similarity index 75% rename from r-package/morie/man/linear_regression_ols.Rd rename to r-package/morie/man/morie_linear_regression_ols.Rd index 855ac7dcd3..93f02c2ab0 100644 --- a/r-package/morie/man/linear_regression_ols.Rd +++ b/r-package/morie/man/morie_linear_regression_ols.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/linrg.R -\name{linear_regression_ols} -\alias{linear_regression_ols} +\name{morie_linear_regression_ols} +\alias{morie_linear_regression_ols} \title{Ordinary least squares closed-form solution (R parity)} \usage{ -linear_regression_ols(x, y) +morie_linear_regression_ols(x, y) } \arguments{ \item{x}{Numeric matrix or vector of predictors.} @@ -20,10 +20,7 @@ Wraps \code{stats::lm} and returns coefficients plus classical OLS standard errors. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_linear_regression_ols(x = rnorm(50), y = rnorm(50)) } \references{ Hastie, Tibshirani & Friedman, Elements of Statistical Learning (2009). diff --git a/r-package/morie/man/morie_list_datasets.Rd b/r-package/morie/man/morie_list_datasets.Rd index d5ca94c434..079bf0e61e 100644 --- a/r-package/morie/man/morie_list_datasets.Rd +++ b/r-package/morie/man/morie_list_datasets.Rd @@ -4,10 +4,12 @@ \alias{morie_list_datasets} \title{List all datasets with cache status} \usage{ -morie_list_datasets(db_path = NULL) +morie_list_datasets(db_path = NULL, con = NULL) } \arguments{ -\item{db_path}{Optional override for the database path.} +\item{db_path}{Optional path to a SQLite/DuckDB file (default backend).} + +\item{con}{Optional pre-opened DBI connection (overrides \code{db_path}).} } \value{ A data.frame with columns: key, name, source, survey, year, type, @@ -17,8 +19,5 @@ cached (logical), rows (integer or NA). List all datasets with cache status } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_list_datasets() } diff --git a/r-package/morie/man/list_morie_modules.Rd b/r-package/morie/man/morie_list_morie_modules.Rd similarity index 61% rename from r-package/morie/man/list_morie_modules.Rd rename to r-package/morie/man/morie_list_morie_modules.Rd index 48af60e4e9..684288c0af 100644 --- a/r-package/morie/man/list_morie_modules.Rd +++ b/r-package/morie/man/morie_list_morie_modules.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/modules.R -\name{list_morie_modules} -\alias{list_morie_modules} +\name{morie_list_morie_modules} +\alias{morie_list_morie_modules} \title{List implemented MORIE CPADS modules} \usage{ -list_morie_modules() +morie_list_morie_modules() } \value{ Data frame describing the implemented module surface. @@ -13,8 +13,5 @@ Data frame describing the implemented module surface. List implemented MORIE CPADS modules } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_list_morie_modules() } diff --git a/r-package/morie/man/morie_load_cpads.Rd b/r-package/morie/man/morie_load_cpads.Rd index 16fec4bc30..63d9dafe7a 100644 --- a/r-package/morie/man/morie_load_cpads.Rd +++ b/r-package/morie/man/morie_load_cpads.Rd @@ -4,13 +4,15 @@ \alias{morie_load_cpads} \title{Load CPADS data: local files -> cache -> CKAN API} \usage{ -morie_load_cpads(db_path = NULL, use_ckan = TRUE) +morie_load_cpads(db_path = NULL, use_ckan = TRUE, con = NULL) } \arguments{ -\item{db_path}{Optional override for the database path.} +\item{db_path}{Optional path to a SQLite/DuckDB file (default backend).} \item{use_ckan}{Logical; if TRUE and data not found locally or in cache, attempt to fetch from the CKAN API.} + +\item{con}{Optional pre-opened DBI connection (overrides \code{db_path}).} } \value{ A data.frame with canonical CPADS columns. @@ -25,9 +27,8 @@ Resolution order: } \examples{ \dontrun{ - # Needs the CPADS PUMF (local file, cache, or a live CKAN fetch), - # so it cannot run inside an offline R CMD check. - cpads <- morie_load_cpads(use_ckan = TRUE) - if (!is.null(cpads)) head(cpads) +# Needs the CPADS PUMF (local file, cache, or a live CKAN fetch). +cpads <- morie_load_cpads(use_ckan = TRUE) +if (!is.null(cpads)) head(cpads) } } diff --git a/r-package/morie/man/morie_load_cpads_data.Rd b/r-package/morie/man/morie_load_cpads_data.Rd new file mode 100644 index 0000000000..ea8045578c --- /dev/null +++ b/r-package/morie/man/morie_load_cpads_data.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/modules.R +\name{morie_load_cpads_data} +\alias{morie_load_cpads_data} +\title{Load the real CPADS CSV from this repository} +\usage{ +morie_load_cpads_data(cpads_csv = .cpads_default_csv()) +} +\arguments{ +\item{cpads_csv}{Path to the CPADS CSV.} +} +\value{ +Canonicalized CPADS data frame. +} +\description{ +Load the real CPADS CSV from this repository +} +\examples{ +\donttest{ +# Reads and canonicalises the CPADS PUMF CSV. The default CSV lives in +# a morie project tree; the CKAN-fetched PUMF works identically (see +# morie_load_dataset("ocp21")). The tryCatch guard lets the example +# render cleanly on machines without the CSV checked out locally. +tryCatch(morie_load_cpads_data(), error = function(e) message(conditionMessage(e))) +} +} diff --git a/r-package/morie/man/morie_load_dataset.Rd b/r-package/morie/man/morie_load_dataset.Rd index e8b9ebc30d..16b328a274 100644 --- a/r-package/morie/man/morie_load_dataset.Rd +++ b/r-package/morie/man/morie_load_dataset.Rd @@ -4,24 +4,42 @@ \alias{morie_load_dataset} \title{Load a dataset by catalog key} \usage{ -morie_load_dataset(key, db_path = NULL) +morie_load_dataset(key, db_path = NULL, refresh = FALSE, con = NULL) } \arguments{ \item{key}{Dataset catalog key (or fuzzy match).} -\item{db_path}{Optional override for the database path.} +\item{db_path}{Optional path to a SQLite/DuckDB file (default backend).} + +\item{refresh}{If \code{TRUE}, bypass the built-in database and the +user cache (and, for remotely-backed datasets, the local file) and +re-fetch from the remote source, overwriting the cached copy. Use +this to pick up time-to-time updates to a dataset.} + +\item{con}{Optional pre-opened DBI connection for the user cache +(overrides \code{db_path}). The built-in DB read is always SQLite-based +and is unaffected by \code{con}.} } \value{ A data.frame. } \description{ -Resolution: SQLite cache -> local file ingest -> CKAN API -> error. -Supports fuzzy matching: \code{morie_load_dataset("cpads_2021")} resolves -to \code{oc_cpads_2021}. +Resolution tiers, tried in order: built-in DB -> user cache -> local +file -> CKAN datastore -> direct download URL -> ArcGIS layer -> +error. Supports fuzzy matching: \code{morie_load_dataset("cpads_2021")} +resolves to \code{ocp21}. } \examples{ \dontrun{ - df <- morie_load_dataset("ocp21") # CPADS 2021-2022 - nrow(df) +df <- morie_load_dataset("ocp21") # CPADS 2021-2022 (default DuckDB cache) +df <- morie_load_dataset("ocp21", refresh = TRUE) # force re-fetch + +# PostgreSQL cache (run a server first): +# con <- DBI::dbConnect(RPostgres::Postgres(), +# host = "localhost", dbname = "morie", user = "...") +# df <- morie_load_dataset("ocp21", con = con) +} } +\seealso{ +\code{\link{morie_fetch}}, \code{\link{morie_ckan_search}} } diff --git a/r-package/morie/man/lstmc_lstm_cell.Rd b/r-package/morie/man/morie_lstmc_lstm_cell.Rd similarity index 87% rename from r-package/morie/man/lstmc_lstm_cell.Rd rename to r-package/morie/man/morie_lstmc_lstm_cell.Rd index 18a9ee4720..8af07d62b5 100644 --- a/r-package/morie/man/lstmc_lstm_cell.Rd +++ b/r-package/morie/man/morie_lstmc_lstm_cell.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/lstmc.R -\name{lstmc_lstm_cell} -\alias{lstmc_lstm_cell} -\alias{lstm_cell} +\name{morie_lstmc_lstm_cell} +\alias{morie_lstmc_lstm_cell} +\alias{morie_lstm_cell} \title{LSTM cell forward pass} \usage{ -lstmc_lstm_cell( +morie_lstmc_lstm_cell( x, h_prev = NULL, c_prev = NULL, @@ -17,7 +17,7 @@ lstmc_lstm_cell( deterministic_seed = NULL ) -lstm_cell( +morie_lstm_cell( x, h_prev = NULL, c_prev = NULL, @@ -58,10 +58,7 @@ R parity for \code{morie.fn.lstmc.lstm_cell}. Gates stacked as h = o \odot \tanh(c)} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_lstmc_lstm_cell(x = rnorm(50)) } \references{ Hochreiter & Schmidhuber (1997), Neural Computation 9(8). diff --git a/r-package/morie/man/morie_mann_whitney_test.Rd b/r-package/morie/man/morie_mann_whitney_test.Rd new file mode 100644 index 0000000000..f7ce151da1 --- /dev/null +++ b/r-package/morie/man/morie_mann_whitney_test.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_mann_whitney_test} +\alias{morie_mann_whitney_test} +\title{Mann-Whitney U test (Wilcoxon rank-sum)} +\usage{ +morie_mann_whitney_test( + x1, + x2, + alternative = c("two.sided", "greater", "less") +) +} +\arguments{ +\item{x1}{Numeric vector (group 1).} + +\item{x2}{Numeric vector (group 2).} + +\item{alternative}{\code{"two.sided"}, \code{"greater"}, or \code{"less"}.} +} +\value{ +Named list: \code{W}, \code{p_value}, \code{r} (effect size). +} +\description{ +Mann-Whitney U test (Wilcoxon rank-sum) +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/marker_variance.Rd b/r-package/morie/man/morie_marker_variance.Rd similarity index 75% rename from r-package/morie/man/marker_variance.Rd rename to r-package/morie/man/morie_marker_variance.Rd index c764b6cc18..8aca51c2ef 100644 --- a/r-package/morie/man/marker_variance.Rd +++ b/r-package/morie/man/morie_marker_variance.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mrkvr.R -\name{marker_variance} -\alias{marker_variance} +\name{morie_marker_variance} +\alias{morie_marker_variance} \title{Marker variance-component estimation} \usage{ -marker_variance(x, y, markers) +morie_marker_variance(x, y, markers) } \arguments{ \item{x}{Fixed-effect design (optional).} @@ -23,10 +23,10 @@ alongside the naive sigma_g^2 / p form. sigma_g^2 is obtained from a quick GBLUP fit. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_marker_variance( + x = rnorm(50), y = rnorm(50), + markers = matrix(sample(0:2, 200, TRUE), 50, 4) +) } \references{ VanRaden (2008); Montesinos Lopez Ch 3. diff --git a/r-package/morie/man/mhatf_multi_head_attention_full.Rd b/r-package/morie/man/morie_mhatf_multi_head_attention_full.Rd similarity index 81% rename from r-package/morie/man/mhatf_multi_head_attention_full.Rd rename to r-package/morie/man/morie_mhatf_multi_head_attention_full.Rd index becd561ae0..435a0839e7 100644 --- a/r-package/morie/man/mhatf_multi_head_attention_full.Rd +++ b/r-package/morie/man/morie_mhatf_multi_head_attention_full.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mhatf.R -\name{mhatf_multi_head_attention_full} -\alias{mhatf_multi_head_attention_full} -\alias{multi_head_attention_full} +\name{morie_mhatf_multi_head_attention_full} +\alias{morie_mhatf_multi_head_attention_full} +\alias{morie_multi_head_attention_full} \title{Multi-head attention with output projection} \usage{ -mhatf_multi_head_attention_full( +morie_mhatf_multi_head_attention_full( x, num_heads = 2L, W_q = NULL, @@ -16,7 +16,7 @@ mhatf_multi_head_attention_full( deterministic_seed = NULL ) -multi_head_attention_full( +morie_multi_head_attention_full( x, num_heads = 2L, W_q = NULL, @@ -55,10 +55,8 @@ R parity for \code{morie.fn.mhatf.multi_head_attention_full}. \mathrm{Concat}(\mathrm{head}_1, \ldots, \mathrm{head}_h) W^O} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Vaswani et al. (2017), NeurIPS. diff --git a/r-package/morie/man/midas_regression.Rd b/r-package/morie/man/morie_midas_regression.Rd similarity index 74% rename from r-package/morie/man/midas_regression.Rd rename to r-package/morie/man/morie_midas_regression.Rd index 5e1878c502..6992c2a49b 100644 --- a/r-package/morie/man/midas_regression.Rd +++ b/r-package/morie/man/morie_midas_regression.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/midas.R -\name{midas_regression} -\alias{midas_regression} +\name{morie_midas_regression} +\alias{morie_midas_regression} \title{MIDAS regression with Beta-polynomial weights} \usage{ -midas_regression(x, y, K = NULL) +morie_midas_regression(x, y, K = NULL) } \arguments{ \item{x}{High-frequency regressor matrix (n_t x K) or flat vector.} @@ -21,8 +21,6 @@ Named list with \code{beta0, beta1, theta1, theta2, weights, MIDAS regression with Beta-polynomial weights } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/mini_batch_gradient.Rd b/r-package/morie/man/morie_mini_batch_gradient.Rd similarity index 79% rename from r-package/morie/man/mini_batch_gradient.Rd rename to r-package/morie/man/morie_mini_batch_gradient.Rd index 3e37d4fc72..9f6d47486a 100644 --- a/r-package/morie/man/mini_batch_gradient.Rd +++ b/r-package/morie/man/morie_mini_batch_gradient.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mbgrd.R -\name{mini_batch_gradient} -\alias{mini_batch_gradient} +\name{morie_mini_batch_gradient} +\alias{morie_mini_batch_gradient} \title{Mini-batch stochastic gradient descent for OLS (R parity)} \usage{ -mini_batch_gradient( +morie_mini_batch_gradient( x, y, lr = 0.01, @@ -34,8 +34,5 @@ loss, n, method. Mini-batch stochastic gradient descent for OLS (R parity) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_mini_batch_gradient(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/multi_trait_gblup.Rd b/r-package/morie/man/morie_multi_trait_gblup.Rd similarity index 71% rename from r-package/morie/man/multi_trait_gblup.Rd rename to r-package/morie/man/morie_multi_trait_gblup.Rd index d3e6fe15c6..f86d84de2f 100644 --- a/r-package/morie/man/multi_trait_gblup.Rd +++ b/r-package/morie/man/morie_multi_trait_gblup.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mtgbl.R -\name{multi_trait_gblup} -\alias{multi_trait_gblup} +\name{morie_multi_trait_gblup} +\alias{morie_multi_trait_gblup} \title{Multi-trait GBLUP via vec-stacked mixed-model equations} \usage{ -multi_trait_gblup(x, y, markers, Sigma_g = NULL, Sigma_e = NULL) +morie_multi_trait_gblup(x, y, markers, Sigma_g = NULL, Sigma_e = NULL) } \arguments{ \item{x}{Fixed-effect design (vector or matrix).} @@ -24,10 +24,10 @@ list(estimate, G_hat, B_hat, Sigma_g, Sigma_e, n, t, method). Multi-trait GBLUP via vec-stacked mixed-model equations } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_multi_trait_gblup( + x = rnorm(50), y = rnorm(50), + markers = matrix(sample(0:2, 200, TRUE), 50, 4) +) } \references{ Montesinos Lopez Ch 10. diff --git a/r-package/morie/man/morie_mvn_with_covariance.Rd b/r-package/morie/man/morie_mvn_with_covariance.Rd index f299beefdd..03402f9fb0 100644 --- a/r-package/morie/man/morie_mvn_with_covariance.Rd +++ b/r-package/morie/man/morie_mvn_with_covariance.Rd @@ -34,8 +34,6 @@ An n x p matrix of samples. Draw multivariate normal samples under a structured covariance } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/mxpol_maxpool_forward.Rd b/r-package/morie/man/morie_mxpol_maxpool_forward.Rd similarity index 67% rename from r-package/morie/man/mxpol_maxpool_forward.Rd rename to r-package/morie/man/morie_mxpol_maxpool_forward.Rd index dcc9fd240b..d7cb5ced67 100644 --- a/r-package/morie/man/mxpol_maxpool_forward.Rd +++ b/r-package/morie/man/morie_mxpol_maxpool_forward.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxpol.R -\name{mxpol_maxpool_forward} -\alias{mxpol_maxpool_forward} -\alias{maxpool_forward} +\name{morie_mxpol_maxpool_forward} +\alias{morie_mxpol_maxpool_forward} +\alias{morie_maxpool_forward} \title{Max pooling forward pass (2D, single channel)} \usage{ -mxpol_maxpool_forward(x, kernel_size = 2L, stride = NULL) +morie_mxpol_maxpool_forward(x, kernel_size = 2L, stride = NULL) -maxpool_forward(x, kernel_size = 2L, stride = NULL) +morie_maxpool_forward(x, kernel_size = 2L, stride = NULL) } \arguments{ \item{x}{Numeric matrix \code{(H, W)}.} @@ -26,10 +26,8 @@ R parity for \code{morie.fn.mxpol.maxpool_forward}. \deqn{y[i,j] = \max_{0 \le m,n < k} x[i s + m, j s + n]} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Goodfellow et al. (2016), Deep Learning, Ch 9.3. diff --git a/r-package/morie/man/nbeats_basis.Rd b/r-package/morie/man/morie_nbeats_basis.Rd similarity index 75% rename from r-package/morie/man/nbeats_basis.Rd rename to r-package/morie/man/morie_nbeats_basis.Rd index 37b7b44db2..bf3947aa49 100644 --- a/r-package/morie/man/nbeats_basis.Rd +++ b/r-package/morie/man/morie_nbeats_basis.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/nbeat.R -\name{nbeats_basis} -\alias{nbeats_basis} +\name{morie_nbeats_basis} +\alias{morie_nbeats_basis} \title{N-BEATS-style polynomial + Fourier basis-expansion forecasting} \usage{ -nbeats_basis(x, horizon = 1, n_trend = 3, n_season = 5, period = 12) +morie_nbeats_basis(x, horizon = 1, n_trend = 3, n_season = 5, period = 12) } \arguments{ \item{x}{Numeric history.} @@ -25,8 +25,5 @@ Named list with \code{forecast, fitted, trend, seasonal, N-BEATS-style polynomial + Fourier basis-expansion forecasting } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_nbeats_basis(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_odds_ratio_ci.Rd b/r-package/morie/man/morie_odds_ratio_ci.Rd new file mode 100644 index 0000000000..78ba958821 --- /dev/null +++ b/r-package/morie/man/morie_odds_ratio_ci.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_odds_ratio_ci} +\alias{morie_odds_ratio_ci} +\title{Odds ratio and 95\% CI from a 2x2 contingency table} +\usage{ +morie_odds_ratio_ci(table_2x2, alpha = 0.05) +} +\arguments{ +\item{table_2x2}{A 2x2 matrix: rows are treatment, columns are outcome.} + +\item{alpha}{Significance level.} +} +\value{ +Named list: \code{or}, \code{ci_lower}, \code{ci_upper}, \code{p_value}. +} +\description{ +Odds ratio and 95\% CI from a 2x2 contingency table +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_omega_squared.Rd b/r-package/morie/man/morie_omega_squared.Rd new file mode 100644 index 0000000000..68fba8b4ad --- /dev/null +++ b/r-package/morie/man/morie_omega_squared.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_omega_squared} +\alias{morie_omega_squared} +\title{Omega-squared (less biased than eta-squared)} +\usage{ +morie_omega_squared(f_stat, df_between, df_within, n) +} +\arguments{ +\item{f_stat}{F statistic.} + +\item{df_between}{Degrees of freedom (numerator).} + +\item{df_within}{Degrees of freedom (denominator).} + +\item{n}{Total sample size.} +} +\value{ +Numeric omega-squared. +} +\description{ +Omega-squared (less biased than eta-squared) +} +\examples{ +morie_omega_squared(f_stat = 5.2, df_between = 2, df_within = 87, n = 90) +} diff --git a/r-package/morie/man/one_sample_coverage.Rd b/r-package/morie/man/morie_one_sample_coverage.Rd similarity index 74% rename from r-package/morie/man/one_sample_coverage.Rd rename to r-package/morie/man/morie_one_sample_coverage.Rd index 1ee2c37e3d..96cfc81799 100644 --- a/r-package/morie/man/one_sample_coverage.Rd +++ b/r-package/morie/man/morie_one_sample_coverage.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/covsp.R -\name{one_sample_coverage} -\alias{one_sample_coverage} +\name{morie_one_sample_coverage} +\alias{morie_one_sample_coverage} \title{One-sample coverage probability (Gibbons Ch 2.11.1)} \usage{ -one_sample_coverage(x) +morie_one_sample_coverage(x) } \arguments{ \item{x}{Numeric vector.} @@ -19,8 +19,5 @@ are i.i.d. Beta(1, n) under H0. Returns empirical coverages (rank-based) plus the cumulative coverage F(X_(n)) - F(X_(1)). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_one_sample_coverage(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_one_sample_t_test.Rd b/r-package/morie/man/morie_one_sample_t_test.Rd new file mode 100644 index 0000000000..c2fd9d3102 --- /dev/null +++ b/r-package/morie/man/morie_one_sample_t_test.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_one_sample_t_test} +\alias{morie_one_sample_t_test} +\title{One-sample t-test} +\usage{ +morie_one_sample_t_test( + x, + mu0 = 0, + alternative = c("two.sided", "greater", "less") +) +} +\arguments{ +\item{x}{Numeric vector.} + +\item{mu0}{Null hypothesis mean (default 0).} + +\item{alternative}{\code{"two.sided"}, \code{"greater"}, or \code{"less"}.} +} +\value{ +Named list: \code{t}, \code{df}, \code{p_value}, \code{ci}. +} +\description{ +One-sample t-test +} +\examples{ +morie_one_sample_t_test(x = rnorm(50)) +} diff --git a/r-package/morie/man/morie_ordered_alternatives_test.Rd b/r-package/morie/man/morie_ordered_alternatives_test.Rd new file mode 100644 index 0000000000..8f3975411a --- /dev/null +++ b/r-package/morie/man/morie_ordered_alternatives_test.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ordlt_jonckheere.R +\name{morie_ordered_alternatives_test} +\alias{morie_ordered_alternatives_test} +\title{Jonckheere-Terpstra ordered-alternatives test (Gibbons Ch 10.6)} +\usage{ +morie_ordered_alternatives_test(groups) +} +\arguments{ +\item{groups}{List of numeric vectors in monotone hypothesised order.} +} +\value{ +Named list: statistic, p_value, z, E_J, Var_J, n, k, method. +} +\description{ +Tests H0: F_1 = ... = F_k against the ordered alternative +H1: F_1 <= F_2 <= ... <= F_k. J = sum over i 95\% CI).} + +\item{method}{\code{"wilson"} (default), \code{"exact"} (Clopper-Pearson), +or \code{"wald"}.} +} +\value{ +Named list: \code{p_hat}, \code{ci_lower}, \code{ci_upper}. +} +\description{ +Wilson score confidence interval for a proportion +} +\examples{ +morie_proportion_ci(35, 100) +} diff --git a/r-package/morie/man/random_forest_ensemble.Rd b/r-package/morie/man/morie_random_forest_ensemble.Rd similarity index 85% rename from r-package/morie/man/random_forest_ensemble.Rd rename to r-package/morie/man/morie_random_forest_ensemble.Rd index b666695fb3..7136ac68bf 100644 --- a/r-package/morie/man/random_forest_ensemble.Rd +++ b/r-package/morie/man/morie_random_forest_ensemble.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rfens.R -\name{random_forest_ensemble} -\alias{random_forest_ensemble} +\name{morie_random_forest_ensemble} +\alias{morie_random_forest_ensemble} \title{Random Forest ensemble (R parity)} \usage{ -random_forest_ensemble( +morie_random_forest_ensemble( x, y, n_estimators = 100L, @@ -41,8 +41,5 @@ Wraps \code{randomForest::randomForest}. Auto-detects task from y (factor / integer-like -> classification, otherwise regression). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_random_forest_ensemble(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/random_forest_genomic.Rd b/r-package/morie/man/morie_random_forest_genomic.Rd similarity index 79% rename from r-package/morie/man/random_forest_genomic.Rd rename to r-package/morie/man/morie_random_forest_genomic.Rd index 4ea071c1fd..133c6b8f04 100644 --- a/r-package/morie/man/random_forest_genomic.Rd +++ b/r-package/morie/man/morie_random_forest_genomic.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rfgen.R -\name{random_forest_genomic} -\alias{random_forest_genomic} +\name{morie_random_forest_genomic} +\alias{morie_random_forest_genomic} \title{Random-forest genomic predictor} \usage{ -random_forest_genomic( +morie_random_forest_genomic( x, y, markers, @@ -40,10 +40,10 @@ Uses randomForest if available; otherwise a base-R bagged-tree fallback (regression CART approximation). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_random_forest_genomic( + x = rnorm(50), y = rnorm(50), + markers = matrix(sample(0:2, 200, TRUE), 50, 4) +) } \references{ Breiman (2001); Montesinos Lopez Ch 8. diff --git a/r-package/morie/man/random_search_cv.Rd b/r-package/morie/man/morie_random_search_cv.Rd similarity index 86% rename from r-package/morie/man/random_search_cv.Rd rename to r-package/morie/man/morie_random_search_cv.Rd index 27a0f7623b..40a9d58776 100644 --- a/r-package/morie/man/random_search_cv.Rd +++ b/r-package/morie/man/morie_random_search_cv.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rndsr.R -\name{random_search_cv} -\alias{random_search_cv} +\name{morie_random_search_cv} +\alias{morie_random_search_cv} \title{Random search hyperparameter optimisation (R parity)} \usage{ -random_search_cv( +morie_random_search_cv( x, y, method = NULL, @@ -43,8 +43,6 @@ sampled_scores, n_iter, task, n, method. Uses \code{caret::train} with \code{search = "random"}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/rank_based_test.Rd b/r-package/morie/man/morie_rank_based_test.Rd similarity index 71% rename from r-package/morie/man/rank_based_test.Rd rename to r-package/morie/man/morie_rank_based_test.Rd index 9869f04f64..7e01fe787c 100644 --- a/r-package/morie/man/rank_based_test.Rd +++ b/r-package/morie/man/morie_rank_based_test.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rnkbs.R -\name{rank_based_test} -\alias{rank_based_test} +\name{morie_rank_based_test} +\alias{morie_rank_based_test} \title{Mann's rank test for randomness (Gibbons Ch 3.5)} \usage{ -rank_based_test(x) +morie_rank_based_test(x) } \arguments{ \item{x}{Numeric vector of sequential observations.} @@ -17,8 +17,5 @@ Kendall tau between the observation and its time index t = 1..n. Tests H0: no monotone trend. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_rank_based_test(x = rnorm(50)) } diff --git a/r-package/morie/man/rank_order_statistics.Rd b/r-package/morie/man/morie_rank_order_statistics.Rd similarity index 71% rename from r-package/morie/man/rank_order_statistics.Rd rename to r-package/morie/man/morie_rank_order_statistics.Rd index a3042b603e..0c3d648750 100644 --- a/r-package/morie/man/rank_order_statistics.Rd +++ b/r-package/morie/man/morie_rank_order_statistics.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rnkor.R -\name{rank_order_statistics} -\alias{rank_order_statistics} +\name{morie_rank_order_statistics} +\alias{morie_rank_order_statistics} \title{Signed ranks of paired differences (Gibbons Ch 5.5)} \usage{ -rank_order_statistics(x, mu0 = 0) +morie_rank_order_statistics(x, mu0 = 0) } \arguments{ \item{x}{Numeric vector of differences (or values; mu0 is subtracted).} @@ -19,8 +19,5 @@ Signed ranks R_i^+ = sign(D_i) * rank(|D_i|) used by Wilcoxon signed-rank. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_rank_order_statistics(x = rnorm(50)) } diff --git a/r-package/morie/man/rank_placements.Rd b/r-package/morie/man/morie_rank_placements.Rd similarity index 71% rename from r-package/morie/man/rank_placements.Rd rename to r-package/morie/man/morie_rank_placements.Rd index d68336417c..7ef63a4a75 100644 --- a/r-package/morie/man/rank_placements.Rd +++ b/r-package/morie/man/morie_rank_placements.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plcmt.R -\name{rank_placements} -\alias{rank_placements} +\name{morie_rank_placements} +\alias{morie_rank_placements} \title{Rank placements of Y among X order statistics (Gibbons Ch 2.11.3)} \usage{ -rank_placements(x, y) +morie_rank_placements(x, y) } \arguments{ \item{x, y}{Numeric vectors.} @@ -17,8 +17,5 @@ For each Y_j: placement P_j = number of X_i less than Y_j. Their sum is the Mann-Whitney U statistic for Y vs X. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_rank_placements(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/morie_read_outputs_manifest.Rd b/r-package/morie/man/morie_read_outputs_manifest.Rd new file mode 100644 index 0000000000..ad91c0fdee --- /dev/null +++ b/r-package/morie/man/morie_read_outputs_manifest.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/manifest.R +\name{morie_read_outputs_manifest} +\alias{morie_read_outputs_manifest} +\title{Read outputs manifest from a project} +\usage{ +morie_read_outputs_manifest( + project_root = NULL, + manifest_path = NULL, + validate = TRUE +) +} +\arguments{ +\item{project_root}{Project root path.} + +\item{manifest_path}{Optional explicit manifest path.} + +\item{validate}{If \code{TRUE}, validate schema.} +} +\value{ +Manifest data frame. +} +\description{ +Read outputs manifest from a project +} +\examples{ +# Craft a minimal manifest in tempdir and read it back: +tdir <- tempfile("morie-doc-") +dir.create(tdir) +man <- file.path(tdir, "outputs_manifest.csv") +write.csv( + data.frame( + output = "results.csv", + public_path = file.path(tdir, "results.csv"), + size_kb = 0.01, modified = format(Sys.Date()) + ), + man, + row.names = FALSE +) +writeLines("x,y\n1,2", file.path(tdir, "results.csv")) +morie_read_outputs_manifest(manifest_path = man) +} diff --git a/r-package/morie/man/regime_switching.Rd b/r-package/morie/man/morie_regime_switching.Rd similarity index 73% rename from r-package/morie/man/regime_switching.Rd rename to r-package/morie/man/morie_regime_switching.Rd index 3a824f49a0..70c7309b6b 100644 --- a/r-package/morie/man/regime_switching.Rd +++ b/r-package/morie/man/morie_regime_switching.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/regms.R -\name{regime_switching} -\alias{regime_switching} +\name{morie_regime_switching} +\alias{morie_regime_switching} \title{Markov-switching regression (Hamilton 1989)} \usage{ -regime_switching(x, k_regimes = 2) +morie_regime_switching(x, k_regimes = 2) } \arguments{ \item{x}{Numeric univariate series.} @@ -20,8 +20,5 @@ Fit a constant-mean, switching-variance K-regime Markov-switching model by EM (Hamilton filter). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_regime_switching(x = rnorm(50)) } diff --git a/r-package/morie/man/regularization_path.Rd b/r-package/morie/man/morie_regularization_path.Rd similarity index 81% rename from r-package/morie/man/regularization_path.Rd rename to r-package/morie/man/morie_regularization_path.Rd index aa214b12ef..94422af948 100644 --- a/r-package/morie/man/regularization_path.Rd +++ b/r-package/morie/man/morie_regularization_path.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rgztn.R -\name{regularization_path} -\alias{regularization_path} +\name{morie_regularization_path} +\alias{morie_regularization_path} \title{Ridge / LASSO / ElasticNet regularization path (R parity)} \usage{ -regularization_path( +morie_regularization_path( x, y, penalty = c("ridge", "lasso", "elasticnet"), @@ -32,8 +32,6 @@ Wraps \code{glmnet::glmnet}. Returns the coefficient path across the supplied \code{alphas} (lambda grid in glmnet terminology). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/morie_risk_difference_ci.Rd b/r-package/morie/man/morie_risk_difference_ci.Rd new file mode 100644 index 0000000000..2d22e23d34 --- /dev/null +++ b/r-package/morie/man/morie_risk_difference_ci.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_risk_difference_ci} +\alias{morie_risk_difference_ci} +\title{Risk difference (ARD) with Newcombe CI} +\usage{ +morie_risk_difference_ci(table_2x2, alpha = 0.05) +} +\arguments{ +\item{table_2x2}{A 2x2 matrix: rows are exposure, columns are outcome.} + +\item{alpha}{Significance level.} +} +\value{ +Named list: \code{rd}, \code{ci_lower}, \code{ci_upper}. +} +\description{ +Risk difference (ARD) with Newcombe CI +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_risk_ratio_ci.Rd b/r-package/morie/man/morie_risk_ratio_ci.Rd new file mode 100644 index 0000000000..a9c44fa191 --- /dev/null +++ b/r-package/morie/man/morie_risk_ratio_ci.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_risk_ratio_ci} +\alias{morie_risk_ratio_ci} +\title{Risk ratio (relative risk) with log-normal CI} +\usage{ +morie_risk_ratio_ci(table_2x2, alpha = 0.05) +} +\arguments{ +\item{table_2x2}{A 2x2 matrix: rows are exposure, columns are outcome (disease = col 1).} + +\item{alpha}{Significance level.} +} +\value{ +Named list: \code{rr}, \code{ci_lower}, \code{ci_upper}. +} +\description{ +Risk ratio (relative risk) with log-normal CI +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/rkhs_full.Rd b/r-package/morie/man/morie_rkhs_full.Rd similarity index 74% rename from r-package/morie/man/rkhs_full.Rd rename to r-package/morie/man/morie_rkhs_full.Rd index 503dd2933d..6882467f81 100644 --- a/r-package/morie/man/rkhs_full.Rd +++ b/r-package/morie/man/morie_rkhs_full.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rkhsf.R -\name{rkhs_full} -\alias{rkhs_full} +\name{morie_rkhs_full} +\alias{morie_rkhs_full} \title{RKHS regression with Gaussian kernel} \usage{ -rkhs_full(x, y, markers, h = NULL, lam = 1) +morie_rkhs_full(x, y, markers, h = NULL, lam = 1) } \arguments{ \item{x}{Fixed-effect design.} @@ -24,10 +24,7 @@ list(estimate, alpha, beta, K, f_hat, se, h, n, method). RKHS regression with Gaussian kernel } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_rkhs_full(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) } \references{ Gianola & van Kaam (2008). Montesinos Lopez Ch 5. diff --git a/r-package/morie/man/rnn_genomic.Rd b/r-package/morie/man/morie_rnn_genomic.Rd similarity index 84% rename from r-package/morie/man/rnn_genomic.Rd rename to r-package/morie/man/morie_rnn_genomic.Rd index 0b7ec2d8c1..2c566a749f 100644 --- a/r-package/morie/man/rnn_genomic.Rd +++ b/r-package/morie/man/morie_rnn_genomic.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rnnge.R -\name{rnn_genomic} -\alias{rnn_genomic} +\name{morie_rnn_genomic} +\alias{morie_rnn_genomic} \title{Vanilla RNN genomic predictor (BPTT, base R)} \usage{ -rnn_genomic( +morie_rnn_genomic( x, y, markers, @@ -37,10 +37,7 @@ list(estimate, y_hat, W_h, W_x, b_h, w_o, b_o, se, n, method). Vanilla RNN genomic predictor (BPTT, base R) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_rnn_genomic(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) } \references{ Montesinos Lopez Ch 14. diff --git a/r-package/morie/man/roc_auc_score.Rd b/r-package/morie/man/morie_roc_auc_score.Rd similarity index 68% rename from r-package/morie/man/roc_auc_score.Rd rename to r-package/morie/man/morie_roc_auc_score.Rd index e2ae0f43a4..64a434c9ec 100644 --- a/r-package/morie/man/roc_auc_score.Rd +++ b/r-package/morie/man/morie_roc_auc_score.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rocau.R -\name{roc_auc_score} -\alias{roc_auc_score} +\name{morie_roc_auc_score} +\alias{morie_roc_auc_score} \title{ROC curve and AUC (R parity)} \usage{ -roc_auc_score(y_true, y_score) +morie_roc_auc_score(y_true, y_score) } \arguments{ \item{y_true}{Binary labels.} @@ -19,8 +19,6 @@ n_positive, n_negative, method. Wraps \code{pROC::roc}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/rotrp_rotary_position_embedding.Rd b/r-package/morie/man/morie_rotrp_rotary_position_embedding.Rd similarity index 64% rename from r-package/morie/man/rotrp_rotary_position_embedding.Rd rename to r-package/morie/man/morie_rotrp_rotary_position_embedding.Rd index 37d31d1dea..000e5379d1 100644 --- a/r-package/morie/man/rotrp_rotary_position_embedding.Rd +++ b/r-package/morie/man/morie_rotrp_rotary_position_embedding.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rotrp.R -\name{rotrp_rotary_position_embedding} -\alias{rotrp_rotary_position_embedding} -\alias{rotary_position_embedding} +\name{morie_rotrp_rotary_position_embedding} +\alias{morie_rotrp_rotary_position_embedding} +\alias{morie_rotary_position_embedding} \title{Rotary position embedding (RoPE)} \usage{ -rotrp_rotary_position_embedding(x, base = 10000) +morie_rotrp_rotary_position_embedding(x, base = 10000) -rotary_position_embedding(x, base = 10000) +morie_rotary_position_embedding(x, base = 10000) } \arguments{ \item{x}{Numeric matrix \code{(seq_len, d_model)}, \code{d_model} even.} @@ -25,10 +25,8 @@ Each pair (2i, 2i+1) is rotated by \eqn{\theta_{pos, i} = pos / N^{2i/d}}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Su et al. (2021), arXiv:2104.09864. diff --git a/r-package/morie/man/rslnk_residual_connection.Rd b/r-package/morie/man/morie_rslnk_residual_connection.Rd similarity index 65% rename from r-package/morie/man/rslnk_residual_connection.Rd rename to r-package/morie/man/morie_rslnk_residual_connection.Rd index c223146c40..01d3c64711 100644 --- a/r-package/morie/man/rslnk_residual_connection.Rd +++ b/r-package/morie/man/morie_rslnk_residual_connection.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rslnk.R -\name{rslnk_residual_connection} -\alias{rslnk_residual_connection} -\alias{residual_connection} +\name{morie_rslnk_residual_connection} +\alias{morie_rslnk_residual_connection} +\alias{morie_residual_connection} \title{Residual / skip connection} \usage{ -rslnk_residual_connection(x, f = NULL) +morie_rslnk_residual_connection(x, f = NULL) -residual_connection(x, f = NULL) +morie_residual_connection(x, f = NULL) } \arguments{ \item{x}{Numeric array.} @@ -24,10 +24,7 @@ R parity for \code{morie.fn.rslnk.residual_connection}. \deqn{y = \mathcal{F}(x) + x} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_rslnk_residual_connection(x = rnorm(50)) } \references{ He, Zhang, Ren & Sun (2016), CVPR. diff --git a/r-package/morie/man/morie_run_ebac_selection_ipw_analysis.Rd b/r-package/morie/man/morie_run_ebac_selection_ipw_analysis.Rd new file mode 100644 index 0000000000..66d4723a67 --- /dev/null +++ b/r-package/morie/man/morie_run_ebac_selection_ipw_analysis.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ipw.R +\name{morie_run_ebac_selection_ipw_analysis} +\alias{morie_run_ebac_selection_ipw_analysis} +\title{Run the eBAC selection-adjusted IPW workflow} +\usage{ +morie_run_ebac_selection_ipw_analysis( + data, + output_dir = NULL, + treatment = "cannabis_any_use", + covariates = c("age_group", "gender", "province_region", "mental_health", + "physical_health") +) +} +\arguments{ +\item{data}{Analysis data frame.} + +\item{output_dir}{Optional directory for CSV outputs.} + +\item{treatment}{Treatment column name.} + +\item{covariates}{Covariate names used in the observation model.} +} +\value{ +Named list of output tables and the observed-domain analysis frame. +} +\description{ +Mirrors the core outputs of the old \verb{07_ebac_ipw.R} workflow. +} +\examples{ +# Run on a synthetic CPADS-shaped frame (the CKAN-fetched PUMF works +# identically -- see morie_load_cpads_data() for the real frame): +if (requireNamespace("survey", quietly = TRUE)) { + set.seed(1) + n <- 200 + cpads <- data.frame( + weight = runif(n, 0.5, 2), + alcohol_past12m = rbinom(n, 1, 0.8), + heavy_drinking_30d = rbinom(n, 1, 0.3), + ebac_tot = abs(rnorm(n, 0.05, 0.03)), + ebac_legal = rbinom(n, 1, 0.7), + cannabis_any_use = rbinom(n, 1, 0.3), + age_group = sample(1:6, n, TRUE), + gender = sample(1:2, n, TRUE), + province_region = sample(1:5, n, TRUE), + mental_health = sample(1:5, n, TRUE), + physical_health = sample(1:5, n, TRUE) + ) + morie_run_ebac_selection_ipw_analysis(cpads) +} +} diff --git a/r-package/morie/man/run_morie_module.Rd b/r-package/morie/man/morie_run_morie_module.Rd similarity index 51% rename from r-package/morie/man/run_morie_module.Rd rename to r-package/morie/man/morie_run_morie_module.Rd index a38cfc06ad..06ba8a1cd2 100644 --- a/r-package/morie/man/run_morie_module.Rd +++ b/r-package/morie/man/morie_run_morie_module.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/modules.R -\name{run_morie_module} -\alias{run_morie_module} +\name{morie_run_morie_module} +\alias{morie_run_morie_module} \title{Run one implemented MORIE module against CPADS data} \usage{ -run_morie_module( +morie_run_morie_module( module_name, cpads_csv = .cpads_default_csv(), output_dir = NULL @@ -24,8 +24,14 @@ Named list of data-frame outputs. Run one implemented MORIE module against CPADS data } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") +\donttest{ +# Dispatch one MORIE module against the canonical CPADS CSV. The CSV +# ships with a morie project tree, or is fetched via the CKAN endpoint +# (morie_load_dataset("ocp21")). Wrapped in tryCatch so the example +# documents usage even when the CSV is not checked out locally. +tryCatch( + morie_run_morie_module("descriptive-statistics"), + error = function(e) message(conditionMessage(e)) +) } } diff --git a/r-package/morie/man/run_morie_modules.Rd b/r-package/morie/man/morie_run_morie_modules.Rd similarity index 68% rename from r-package/morie/man/run_morie_modules.Rd rename to r-package/morie/man/morie_run_morie_modules.Rd index bed7f837a0..4ea2cd09be 100644 --- a/r-package/morie/man/run_morie_modules.Rd +++ b/r-package/morie/man/morie_run_morie_modules.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/modules.R -\name{run_morie_modules} -\alias{run_morie_modules} +\name{morie_run_morie_modules} +\alias{morie_run_morie_modules} \title{Run multiple implemented MORIE modules} \usage{ -run_morie_modules( - modules = list_morie_modules()$name, +morie_run_morie_modules( + modules = morie_list_morie_modules()$name, cpads_csv = .cpads_default_csv(), output_dir = NULL ) @@ -24,8 +24,6 @@ Named list of module outputs. Run multiple implemented MORIE modules } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/morie_run_pipeline.Rd b/r-package/morie/man/morie_run_pipeline.Rd new file mode 100644 index 0000000000..f943b330e3 --- /dev/null +++ b/r-package/morie/man/morie_run_pipeline.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/workflow.R +\name{morie_run_pipeline} +\alias{morie_run_pipeline} +\title{Run multiple workflow steps} +\usage{ +morie_run_pipeline( + steps = NULL, + project_root = NULL, + script_map = morie_default_workflow_map(), + stop_on_error = TRUE, + verbose = TRUE +) +} +\arguments{ +\item{steps}{Ordered vector of workflow step names.} + +\item{project_root}{Project root directory.} + +\item{script_map}{Named character vector mapping steps to script paths.} + +\item{stop_on_error}{If \code{TRUE}, stop at first failure.} + +\item{verbose}{If \code{TRUE}, streams command output.} +} +\value{ +Data frame of step statuses. +} +\description{ +Run multiple workflow steps +} +\examples{ +# Build a one-step pipeline in tempdir and dispatch it. The +# real package's morie_default_workflow_map() points at scripts that +# live in a morie project tree. +tdir <- tempfile("morie-doc-") +dir.create(tdir) +step <- file.path(tdir, "step.R") +writeLines('cat("hello from pipeline\\\\n")', step) +morie_run_pipeline( + steps = "demo", + project_root = tdir, + script_map = c(demo = step), + verbose = FALSE +) +} diff --git a/r-package/morie/man/morie_run_propensity_ipw_analysis.Rd b/r-package/morie/man/morie_run_propensity_ipw_analysis.Rd new file mode 100644 index 0000000000..b9b5c8433d --- /dev/null +++ b/r-package/morie/man/morie_run_propensity_ipw_analysis.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ipw.R +\name{morie_run_propensity_ipw_analysis} +\alias{morie_run_propensity_ipw_analysis} +\title{Run the CPADS propensity/IPW workflow} +\usage{ +morie_run_propensity_ipw_analysis( + data, + output_dir = NULL, + trim = c(0.01, 0.99), + treatment = "cannabis_any_use", + outcome = "heavy_drinking_30d", + covariates = c("age_group", "gender", "province_region", "mental_health", + "physical_health") +) +} +\arguments{ +\item{data}{Analysis data frame.} + +\item{output_dir}{Optional directory for CSV outputs.} + +\item{trim}{Quantile pair used to trim extreme IPW values.} + +\item{treatment}{Binary treatment column.} + +\item{outcome}{Binary outcome column.} + +\item{covariates}{Covariate names for the propensity model.} +} +\value{ +Named list of output tables and the analysis data. +} +\description{ +Mirrors the core outputs of the old \verb{07_propensity.R} workflow. +} +\examples{ +# Run on a synthetic CPADS-shaped frame (the CKAN-fetched PUMF works +# identically -- see morie_load_cpads_data() for the real frame): +set.seed(1) +n <- 200 +cpads <- data.frame( + weight = runif(n, 0.5, 2), + alcohol_past12m = rbinom(n, 1, 0.8), + heavy_drinking_30d = rbinom(n, 1, 0.3), + ebac_tot = abs(rnorm(n, 0.05, 0.03)), + ebac_legal = rbinom(n, 1, 0.7), + cannabis_any_use = rbinom(n, 1, 0.3), + age_group = sample(1:6, n, TRUE), + gender = sample(1:2, n, TRUE), + province_region = sample(1:5, n, TRUE), + mental_health = sample(1:5, n, TRUE), + physical_health = sample(1:5, n, TRUE) +) +morie_run_propensity_ipw_analysis(cpads) +} diff --git a/r-package/morie/man/run_treatment_effects_analysis.Rd b/r-package/morie/man/morie_run_treatment_effects_analysis.Rd similarity index 66% rename from r-package/morie/man/run_treatment_effects_analysis.Rd rename to r-package/morie/man/morie_run_treatment_effects_analysis.Rd index 3fa4e7304d..3eb185cbe6 100644 --- a/r-package/morie/man/run_treatment_effects_analysis.Rd +++ b/r-package/morie/man/morie_run_treatment_effects_analysis.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/investigation.R -\name{run_treatment_effects_analysis} -\alias{run_treatment_effects_analysis} +\name{morie_run_treatment_effects_analysis} +\alias{morie_run_treatment_effects_analysis} \title{Run a treatment-effects analysis (point estimate, SE, 95\% CI)} \usage{ -run_treatment_effects_analysis(data, treatment, outcome, covariates) +morie_run_treatment_effects_analysis(data, treatment, outcome, covariates) } \arguments{ \item{data}{A \code{data.frame}.} @@ -20,7 +20,7 @@ A list with \code{ate}, \code{se}, \code{ci_lower}, \code{ci_upper}, \code{n}, \ } \description{ Mirrors the Python \code{morie.run_treatment_effects_analysis()}. Convenience -wrapper around \code{\link[=estimate_ate]{estimate_ate()}} that also produces a 95\% confidence +wrapper around \code{\link[=morie_estimate_ate]{morie_estimate_ate()}} that also produces a 95\% confidence interval (delta-method approximation). } \examples{ @@ -30,6 +30,7 @@ df <- data.frame( t = rbinom(200, 1, 0.5), x1 = rnorm(200), x2 = rnorm(200) ) -run_treatment_effects_analysis(df, - treatment = "t", outcome = "y", covariates = c("x1", "x2")) +morie_run_treatment_effects_analysis(df, + treatment = "t", outcome = "y", covariates = c("x1", "x2") +) } diff --git a/r-package/morie/man/run_weighted_logistic_analysis.Rd b/r-package/morie/man/morie_run_weighted_logistic_analysis.Rd similarity index 76% rename from r-package/morie/man/run_weighted_logistic_analysis.Rd rename to r-package/morie/man/morie_run_weighted_logistic_analysis.Rd index cb9106e3c7..a4e1c9f64d 100644 --- a/r-package/morie/man/run_weighted_logistic_analysis.Rd +++ b/r-package/morie/man/morie_run_weighted_logistic_analysis.Rd @@ -1,10 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/investigation.R -\name{run_weighted_logistic_analysis} -\alias{run_weighted_logistic_analysis} +\name{morie_run_weighted_logistic_analysis} +\alias{morie_run_weighted_logistic_analysis} \title{Run a weighted logistic-regression analysis} \usage{ -run_weighted_logistic_analysis(data, outcome, predictors, weights_col = NULL) +morie_run_weighted_logistic_analysis( + data, + outcome, + predictors, + weights_col = NULL +) } \arguments{ \item{data}{A \code{data.frame} containing outcome, predictors, and (optionally) @@ -32,8 +37,9 @@ df <- data.frame( y = rbinom(200, 1, 0.4), x1 = rnorm(200), x2 = rnorm(200), - w = runif(200, 0.5, 1.5) + w = runif(200, 0.5, 1.5) +) +morie_run_weighted_logistic_analysis(df, + outcome = "y", predictors = c("x1", "x2"), weights_col = "w" ) -run_weighted_logistic_analysis(df, - outcome = "y", predictors = c("x1", "x2"), weights_col = "w") } diff --git a/r-package/morie/man/morie_run_workflow_step.Rd b/r-package/morie/man/morie_run_workflow_step.Rd new file mode 100644 index 0000000000..c29a1b5b22 --- /dev/null +++ b/r-package/morie/man/morie_run_workflow_step.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/workflow.R +\name{morie_run_workflow_step} +\alias{morie_run_workflow_step} +\title{Run one project workflow step} +\usage{ +morie_run_workflow_step( + step, + project_root = NULL, + script_map = morie_default_workflow_map(), + rscript_bin = file.path(R.home("bin"), "Rscript"), + verbose = TRUE +) +} +\arguments{ +\item{step}{Step name present in \code{script_map}.} + +\item{project_root}{Project root directory.} + +\item{script_map}{Named character vector mapping steps to script paths.} + +\item{rscript_bin}{Optional path to \code{Rscript} binary.} + +\item{verbose}{If \code{TRUE}, streams command output.} +} +\value{ +Named list with step metadata and exit status. +} +\description{ +Run one project workflow step +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_sample.Rd b/r-package/morie/man/morie_sample.Rd index 6960006d11..23296c059b 100644 --- a/r-package/morie/man/morie_sample.Rd +++ b/r-package/morie/man/morie_sample.Rd @@ -1,11 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aaa_helpers_samples.R, R/mrm_samples.R +% Please edit documentation in R/mrm_samples.R \name{morie_sample} \alias{morie_sample} -\title{Load a bundled reference sample CSV by name} +\title{Load a bundled MORIE reference sample by name} \usage{ -morie_sample(name = c("otis_b01", "otis_b09", "otis_c11", "tps_assault")) - morie_sample(name = c("otis_b01", "otis_b09", "otis_c11", "tps_assault")) } \arguments{ @@ -13,21 +11,14 @@ morie_sample(name = c("otis_b01", "otis_b09", "otis_c11", "tps_assault")) \code{"tps_assault"}.} } \value{ -A \code{data.frame} loaded from the bundled CSV. - A data.frame. } \description{ -Parity with the Python \code{morie.load_sample()} helper. Reads a -small reference dataset shipped under \code{inst/extdata/samples/}. - Returns a small CSV that ships with the package, suitable for running examples and tests of the \verb{mrm_*()} callables without any network or external data dependency. } \examples{ -df <- morie_sample("otis_b01") -head(df) b01 <- morie_sample("otis_b01") head(b01) } diff --git a/r-package/morie/man/morie_sample_size_logistic.Rd b/r-package/morie/man/morie_sample_size_logistic.Rd new file mode 100644 index 0000000000..73dadb9a00 --- /dev/null +++ b/r-package/morie/man/morie_sample_size_logistic.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_sample_size_logistic} +\alias{morie_sample_size_logistic} +\title{Sample size for logistic regression detecting a target odds ratio} +\usage{ +morie_sample_size_logistic(p0, or, alpha = 0.05, power = 0.8, two_sided = TRUE) +} +\arguments{ +\item{p0}{Prevalence under control.} + +\item{or}{Target odds ratio.} + +\item{alpha}{Significance level.} + +\item{power}{Desired power.} + +\item{two_sided}{Logical.} +} +\value{ +Integer sample size. +} +\description{ +Uses the formula from Hsieh et al. (1998): +\deqn{n = \frac{(z_{\alpha/2} + z_\beta)^2}{p_1(1-p_1) [\log(OR)]^2}} +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} +\references{ +Hsieh FY, Bloch DA, Larsen MD (1998). A simple method of sample size +calculation for linear and logistic regression. +\emph{Statistics in Medicine}, 17(14):1623-1634. +} diff --git a/r-package/morie/man/morie_sensitivity_rosenbaum.Rd b/r-package/morie/man/morie_sensitivity_rosenbaum.Rd new file mode 100644 index 0000000000..98ed50e055 --- /dev/null +++ b/r-package/morie/man/morie_sensitivity_rosenbaum.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/causal.R +\name{morie_sensitivity_rosenbaum} +\alias{morie_sensitivity_rosenbaum} +\title{Rosenbaum bounds sensitivity analysis} +\usage{ +morie_sensitivity_rosenbaum( + treated, + control, + gamma_range = seq(1, 3, by = 0.2) +) +} +\arguments{ +\item{treated}{Numeric vector of outcomes for treated units.} + +\item{control}{Numeric vector of outcomes for control units +(may differ in length from \code{treated} for unmatched designs).} + +\item{gamma_range}{Numeric vector of \eqn{\Gamma} values to test.} +} +\value{ +Data frame with columns: \code{gamma}, \code{p_lower}, \code{p_upper}. +} +\description{ +For a range of hidden-confounding levels \eqn{\Gamma}, tests whether +the treatment effect remains significant. A large \eqn{\Gamma} at +which the result remains significant indicates robustness. +} +\details{ +Uses Wilcoxon signed-rank statistic bounds for matched designs. +For unmatched data, computes sign-score bounds. +} +\examples{ +morie_sensitivity_rosenbaum(treated = rnorm(30, 0.5), control = rnorm(30)) +} +\references{ +Rosenbaum PR (2002). \emph{Observational Studies} (2nd ed.). Springer. +} diff --git a/r-package/morie/man/sgolay_smooth.Rd b/r-package/morie/man/morie_sgolay_smooth.Rd similarity index 80% rename from r-package/morie/man/sgolay_smooth.Rd rename to r-package/morie/man/morie_sgolay_smooth.Rd index fd409eac66..d997869ff9 100644 --- a/r-package/morie/man/sgolay_smooth.Rd +++ b/r-package/morie/man/morie_sgolay_smooth.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/signal.R -\name{sgolay_smooth} -\alias{sgolay_smooth} +\name{morie_sgolay_smooth} +\alias{morie_sgolay_smooth} \title{Savitzky-Golay smoothing filter} \usage{ -sgolay_smooth(x, window_length = 11L, polyorder = 3L) +morie_sgolay_smooth(x, window_length = 11L, polyorder = 3L) } \arguments{ \item{x}{Numeric vector.} @@ -27,7 +27,7 @@ if (requireNamespace("signal", quietly = TRUE)) { set.seed(1) t <- seq(0, 1, length.out = 200) x <- sin(2 * pi * 3 * t) + rnorm(200, sd = 0.2) - y <- sgolay_smooth(x, window_length = 11, polyorder = 3) + y <- morie_sgolay_smooth(x, window_length = 11, polyorder = 3) length(y$filtered) } } diff --git a/r-package/morie/man/morie_shapiro_wilk_test.Rd b/r-package/morie/man/morie_shapiro_wilk_test.Rd new file mode 100644 index 0000000000..da78d9be1a --- /dev/null +++ b/r-package/morie/man/morie_shapiro_wilk_test.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_shapiro_wilk_test} +\alias{morie_shapiro_wilk_test} +\title{Shapiro-Wilk normality test} +\usage{ +morie_shapiro_wilk_test(x, alpha = 0.05) +} +\arguments{ +\item{x}{Numeric vector.} + +\item{alpha}{Significance level for the \code{is_normal} flag (default 0.05).} +} +\value{ +Named list: \code{W}, \code{p_value}, \code{is_normal}. +} +\description{ +Shapiro-Wilk normality test +} +\examples{ +morie_shapiro_wilk_test(x = rnorm(50)) +} diff --git a/r-package/morie/man/sign_test_power.Rd b/r-package/morie/man/morie_sign_test_power.Rd similarity index 74% rename from r-package/morie/man/sign_test_power.Rd rename to r-package/morie/man/morie_sign_test_power.Rd index ae40896e20..7240442be2 100644 --- a/r-package/morie/man/sign_test_power.Rd +++ b/r-package/morie/man/morie_sign_test_power.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/sgnpw.R -\name{sign_test_power} -\alias{sign_test_power} +\name{morie_sign_test_power} +\alias{morie_sign_test_power} \title{Power function of the two-sided sign test (Gibbons Ch 5.4.4)} \usage{ -sign_test_power(x, mu0 = 0, p_alt = 0.7, alpha = 0.05) +morie_sign_test_power(x, mu0 = 0, p_alt = 0.7, alpha = 0.05) } \arguments{ \item{x}{Numeric vector (only \code{length(x != mu0)} is used).} @@ -24,8 +24,5 @@ Builds the discrete rejection region under H0: p = 0.5 with size <= alpha, then evaluates power at the alternative p_alt. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_sign_test_power(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_simple_random_sample.Rd b/r-package/morie/man/morie_simple_random_sample.Rd new file mode 100644 index 0000000000..858c70adcb --- /dev/null +++ b/r-package/morie/man/morie_simple_random_sample.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampling.R +\name{morie_simple_random_sample} +\alias{morie_simple_random_sample} +\title{Simple random sample from a data frame} +\usage{ +morie_simple_random_sample(df, n, replace = FALSE, seed = 42L) +} +\arguments{ +\item{df}{A data frame.} + +\item{n}{Number of units to select.} + +\item{replace}{Sample with replacement? Default \code{FALSE}.} + +\item{seed}{Random seed for reproducibility.} +} +\value{ +A data frame of \code{n} sampled rows with a \code{.weight} column added. +} +\description{ +Simple random sample from a data frame +} +\examples{ +df <- data.frame(x = 1:100) +srs_sample <- morie_simple_random_sample(df, 20) +} diff --git a/r-package/morie/man/morie_siu_anomaly_check.Rd b/r-package/morie/man/morie_siu_anomaly_check.Rd new file mode 100644 index 0000000000..18324488f8 --- /dev/null +++ b/r-package/morie/man/morie_siu_anomaly_check.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_anomaly_check} +\alias{morie_siu_anomaly_check} +\title{Per-field anomaly check: does the parser's extraction match the HTML?} +\usage{ +morie_siu_anomaly_check( + case_number, + model = c("ollama", "gemini"), + cache_dir = file.path(tempdir(), "morie", "siu"), + max_html_chars = 80000L, + mock_response_text = NULL +) +} +\arguments{ +\item{case_number}{An SIU case number (e.g. \code{"17-OVI-201"}).} + +\item{model}{One of \code{"ollama"} (default; free, runs locally, +zero-config when an Ollama daemon is on \code{localhost:11434}), +\code{"gemini"} (paid), or \code{"claude"} (paid). A character +vector enables fail-over: the first model whose call succeeds +wins. The default \code{c("ollama", "gemini")} tries the local +free model first and only escalates to paid Gemini if Ollama +isn't installed or fails -- so morie costs $0 to use as long +as you have a free Gemma / Qwen / Llama running locally +(e.g. \code{ollama pull gemma3:4b}).} + +\item{cache_dir}{Directory holding the harvester's SIU.csv and +the optional \code{html/} subdirectory.} + +\item{max_html_chars}{Soft cap on the HTML payload sent to the +model (default 80,000 -- larger than any real SIU report, +small enough to stay under typical context budgets).} + +\item{mock_response_text}{For testing only: if non-NULL, skip the +network call and use this string as the model's raw reply.} +} +\value{ +A data frame with one row per populated parser field: +\code{field}, \code{parser_value}, \code{verdict} (one of +\code{"agree"} / \code{"disagree"} / \code{"unclear"}), and +\code{reason} (a short sentence pointing to the report passage). +} +\description{ +For each populated field in the parser's row, ask the LLM whether +the extracted value is supported by the cached report HTML. Used +to surface fields where the regex parser is plausibly wrong -- +the LLM's verdicts are not authoritative, just an automated way +to triage which rows a human should re-read against the HTML. +} +\details{ +One API call is made per case (all fields batched into a single +prompt with structured-JSON output). +} +\examples{ +\dontrun{ +Sys.setenv(GOOGLE_API_KEY = "your-gemini-key") +a <- morie_siu_anomaly_check("17-OVI-201", model = "gemini") +subset(a, verdict == "disagree") +} +} diff --git a/r-package/morie/man/morie_siu_audit_case.Rd b/r-package/morie/man/morie_siu_audit_case.Rd new file mode 100644 index 0000000000..423cd0a44f --- /dev/null +++ b/r-package/morie/man/morie_siu_audit_case.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_audit_case} +\alias{morie_siu_audit_case} +\title{Audit one SIU case end-to-end: parser output + raw HTML} +\usage{ +morie_siu_audit_case( + case_number, + cache_dir = file.path(tempdir(), "morie", "siu"), + fetch_if_missing = TRUE +) +} +\arguments{ +\item{case_number}{An SIU case number (e.g. \code{"17-OVI-201"}), +or an integer drid.} + +\item{cache_dir}{Directory holding the harvester's SIU.csv and the +optional \code{html/} subdirectory.} + +\item{fetch_if_missing}{If \code{TRUE} (default), fetch the page +from SIU when the local cache misses. Set \code{FALSE} to work +strictly from the cache.} +} +\value{ +A list with elements \code{row} (the parser's 1-row data +frame for this case), \code{drid}, \code{nrid}, +\code{report_html}, \code{news_html}, \code{report_text} +(HTML-stripped plain text of the report) and \code{news_text}. +} +\description{ +For any case_number (or drid), return the parser's 64-column row +together with the raw HTML pages it was extracted from -- the +director's-report page and, when linked, the news-release page. +This is the per-row ground truth: every field in the emitted CSV +is reproducible from \code{report_html} via the parser, and any +disagreement with another data source can be adjudicated against +the saved HTML. +} +\details{ +Reads from the local cache at \code{/html/} (populated +by \code{morie_fetch_siu(cache_html = TRUE)}) when available, and +falls back to a polite live fetch when the cache is missing. +} +\examples{ +\dontrun{ +a <- morie_siu_audit_case( + "17-OVI-201", + cache_dir = file.path(tempdir(), "morie", "siu") +) +cat(substr(a$report_text, 1, 1000), "\n") +} +} diff --git a/r-package/morie/man/morie_siu_audit_columns.Rd b/r-package/morie/man/morie_siu_audit_columns.Rd new file mode 100644 index 0000000000..0b4ece5979 --- /dev/null +++ b/r-package/morie/man/morie_siu_audit_columns.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_audit_columns} +\alias{morie_siu_audit_columns} +\title{Per-column accuracy audit: estimate every SIU column's correctness} +\usage{ +morie_siu_audit_columns( + case_numbers, + model = c("ollama", "gemini"), + cache_dir = file.path(tempdir(), "morie", "siu"), + max_html_chars = 80000L, + max_examples_per_field = 5L, + progress = TRUE +) +} +\arguments{ +\item{case_numbers}{Character vector of SIU case numbers to audit.} + +\item{model}{One of \code{"ollama"} (default; free, runs locally, +zero-config when an Ollama daemon is on \code{localhost:11434}), +\code{"gemini"} (paid), or \code{"claude"} (paid). A character +vector enables fail-over: the first model whose call succeeds +wins. The default \code{c("ollama", "gemini")} tries the local +free model first and only escalates to paid Gemini if Ollama +isn't installed or fails -- so morie costs $0 to use as long +as you have a free Gemma / Qwen / Llama running locally +(e.g. \code{ollama pull gemma3:4b}).} + +\item{cache_dir}{Directory holding the harvester's SIU.csv and +the optional \code{html/} subdirectory.} + +\item{max_html_chars}{Soft cap on the HTML payload sent to the +model (default 80,000 -- larger than any real SIU report, +small enough to stay under typical context budgets).} + +\item{max_examples_per_field}{Maximum disagreement examples +retained per field (default 5).} + +\item{progress}{Logical; print a per-case progress line.} +} +\value{ +A data frame with columns \code{field}, \code{n_audited}, +\code{n_agree}, \code{n_disagree}, \code{n_unclear}, +\code{agree_rate}. Sorted ascending by \code{agree_rate} so the +most-broken fields land at the top. The \code{"examples"} +attribute holds nested data frames of flagged cases per field. +} +\description{ +Runs \code{morie_siu_anomaly_check()} on a vector of case_numbers +and aggregates per-field across them. Output is a data frame with +one row per SIU column, ordered by how often the LLM auditor +agreed with the C++ parser. The worst-ranked rows are the +parser fields that most deserve regex / extraction-logic fixes. +} +\details{ +Examples of LLM-flagged disagreements are attached as the +\code{"examples"} attribute of the returned data frame (one +nested data frame per field), with at most +\code{max_examples_per_field} cases each. Each example carries +the case_number, the parser_value, and the LLM's one-sentence +reason -- enough for a maintainer to pop the cached HTML for +that case, see who's right, and decide whether to refine the +regex pattern for that field. + +Designed for cheap local audit: with \code{model = "ollama"} +pointed at a local Gemma / Qwen / DeepSeek instance, auditing +50-100 cases costs zero API spend and finishes in a few +minutes. With \code{model = c("gemini", "ollama")} the chain +uses paid Gemini first and silently falls back to the local +model on quota / network errors. +} +\examples{ +\dontrun{ +Sys.setenv( + OLLAMA_HOST = "http://localhost:11434", + OLLAMA_MODEL = "gemma3:4b" +) +csv <- morie_fetch_siu(cache_html = TRUE) +df <- utils::read.csv(csv, colClasses = "character") +sample <- sample(df$case_number[nzchar(df$case_number)], 50L) +audit <- morie_siu_audit_columns(sample, model = "ollama") +# Worst 8 fields, ripe for parser fixes: +head(audit, 8) +# See concrete disagreements for the worst field: +attr(audit, "examples")[[audit$field[1L]]] +} +} diff --git a/r-package/morie/man/morie_siu_compare.Rd b/r-package/morie/man/morie_siu_compare.Rd new file mode 100644 index 0000000000..2682bc827b --- /dev/null +++ b/r-package/morie/man/morie_siu_compare.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_compare} +\alias{morie_siu_compare} +\title{Field-by-field SIU comparison against a user-supplied external table} +\usage{ +morie_siu_compare( + case_number, + external, + field_map = NULL, + external_case_col = "Q1", + cache_dir = file.path(tempdir(), "morie", "siu") +) +} +\arguments{ +\item{case_number}{A case number (e.g. \code{"17-OVI-201"}).} + +\item{external}{A data frame of external answers, OR a path to an +\code{.xlsx} file (read with \code{readxl}). Must contain a +column whose values match SIU case numbers (default +\code{external_case_col = "Q1"}).} + +\item{field_map}{A named list mapping external-column names to +morie field names.} + +\item{external_case_col}{Name of the external column carrying the +case-number key.} + +\item{cache_dir}{Directory holding the harvester's SIU.csv and +optional cached HTML.} +} +\value{ +A data frame with one row per mapped field: \code{field}, +\code{parser_value}, \code{external_value}, \code{agree}, and +\code{html_excerpt} (a 240-character window around the first +occurrence of either value in the cleaned report text). When +parser and external disagree, the \code{html_excerpt} is the +tie-breaker. +} +\description{ +For one case_number, line up the parser's value against the same +field in a user-supplied external data source -- and, critically, +show the surrounding report HTML so the user can adjudicate any +disagreement against the actual source document. +} +\details{ +\strong{The ground truth is the SIU director's-report HTML +itself.} The HTML is what the SIU published; the parser's job is +to extract structured fields from it faithfully, and any field's +correctness is decidable by reading the cached HTML for that +case. Any external reference -- a hand-coded survey, an +independently-scraped CSV, a colleague's analysis -- is just +another extraction attempt, possibly with its own errors. This +function does not endorse any external source; it only displays +both side-by-side with the HTML excerpt so you can decide. + +The default field map covers the common SIU-extraction column +layout (\code{Q1 = case_number}, \code{Q3 = police_service}, +\code{Q4 = number_of_officers_involved}, ...). Pass a custom +\code{field_map} for any other external schema. +} +\examples{ +\dontrun{ +# Caller supplies their own external table; nothing about the +# mapping or the file format is canonical to morie. +external <- data.frame(case_id = "17-OVI-201", officers = 1L) +cmp <- morie_siu_compare( + "17-OVI-201", + external = external, + field_map = list(officers = "number_of_officers_involved"), + external_case_col = "case_id" +) +subset(cmp, !agree) +} +} diff --git a/r-package/morie/man/morie_siu_index.Rd b/r-package/morie/man/morie_siu_index.Rd new file mode 100644 index 0000000000..b0a5834fbe --- /dev/null +++ b/r-package/morie/man/morie_siu_index.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_index} +\alias{morie_siu_index} +\title{SIU drid → case_number → language index} +\usage{ +morie_siu_index(lang = c("all", "en", "fr", "valid"), canonical_only = FALSE) +} +\arguments{ +\item{lang}{Filter rows by detected language. \code{"all"} +(default) returns every entry; \code{"en"} returns only the +English drids; \code{"fr"} returns only French; \code{"valid"} +returns every drid whose case_number was successfully parsed +(drops blank / draft drids).} + +\item{canonical_only}{If \code{TRUE}, returns one row per +case_number (the canonical drid for that case, English +preferred). Useful when you want a unique-cases index.} +} +\value{ +A data frame with columns \code{drid}, \code{http_code}, +\code{body_bytes}, \code{attempts}, \code{case_number}, +\code{_language}, \code{source}, \code{retrieved_at_utc}, +\code{canonical_drid}. +} +\description{ +Returns the shipped drid manifest as a data frame -- one row per +director's-report id morie has verified, with the parsed case +number, detected language, and the canonical drid (the English +drid for that case, or the first drid if no English version +exists). This is the index \code{morie_fetch_siu()} uses +internally; exposing it lets users: +} +\details{ +\itemize{ +\item see exactly which drids ship as known-valid (no need +to fetch to find out); +\item subset to English-only or French-only case lists +without running the full harvester; +\item map between drid (URL fragment) and case_number (SIU's +own identifier) offline. +} + +The manifest is refreshed by maintainers via +\code{morie_siu_refresh_manifest()}; it ships gzipped under +\code{inst/extdata/} at ~50 KB. +} +\examples{ +idx <- morie_siu_index(lang = "en") +head(idx) +# How many drids are English vs French vs unknown? +table(morie_siu_index()$`_language`) +# Unique-case index (English-preferred) +canon <- morie_siu_index(canonical_only = TRUE) +nrow(canon) +} diff --git a/r-package/morie/man/morie_siu_llm_extract.Rd b/r-package/morie/man/morie_siu_llm_extract.Rd new file mode 100644 index 0000000000..14f4543ed3 --- /dev/null +++ b/r-package/morie/man/morie_siu_llm_extract.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_llm_extract} +\alias{morie_siu_llm_extract} +\title{Extract SIU report fields with an LLM (Gemini or Claude)} +\usage{ +morie_siu_llm_extract( + case_number, + model = c("ollama", "gemini"), + cache_dir = file.path(tempdir(), "morie", "siu"), + max_html_chars = 80000L, + mock_response_text = NULL +) +} +\arguments{ +\item{case_number}{An SIU case number (e.g. \code{"17-OVI-201"}).} + +\item{model}{One of \code{"ollama"} (default; free, runs locally, +zero-config when an Ollama daemon is on \code{localhost:11434}), +\code{"gemini"} (paid), or \code{"claude"} (paid). A character +vector enables fail-over: the first model whose call succeeds +wins. The default \code{c("ollama", "gemini")} tries the local +free model first and only escalates to paid Gemini if Ollama +isn't installed or fails -- so morie costs $0 to use as long +as you have a free Gemma / Qwen / Llama running locally +(e.g. \code{ollama pull gemma3:4b}).} + +\item{cache_dir}{Directory holding the harvester's SIU.csv and +the optional \code{html/} subdirectory.} + +\item{max_html_chars}{Soft cap on the HTML payload sent to the +model (default 80,000 -- larger than any real SIU report, +small enough to stay under typical context budgets).} + +\item{mock_response_text}{For testing only: if non-NULL, skip the +network call and use this string as the model's raw reply.} +} +\value{ +A one-row data frame with the 64 morie SIU columns. Any +field the model could not extract is the empty string +(matching the C++ parser's convention). +} +\description{ +Sends the cached director's-report HTML for one case through a +large-language-model endpoint and asks it to return the 64-column +morie schema as JSON. The result is in the SAME row format as the +C++ parser, so it drops straight into \code{morie_siu_compare()} +as the \code{external} argument for an independent diff against +the parser. +} +\details{ +The cached HTML remains the ground truth. This function does not +claim the LLM is more accurate than the regex parser; it provides +a fast second extraction so disagreements between two independent +methods (regex vs. LLM) can be flagged for human review against +the saved report. + +Credentials are read from environment variables only -- never +hard-coded, never passed as function arguments -- so secrets do +not leak into call traces, logs, or scripts. Set +\code{GOOGLE_API_KEY} for Gemini, \code{ANTHROPIC_API_KEY} for +Claude, or \code{OLLAMA_HOST} (e.g. +\code{"http://localhost:11434"} or an OllamaFreeAPI base URL) plus +optionally \code{OLLAMA_MODEL} (default \code{"llama3.2:3b"}) for +Ollama-compatible open-weight endpoints. +} +\examples{ +\dontrun{ +Sys.setenv(GOOGLE_API_KEY = "your-gemini-key") +r <- morie_siu_llm_extract("17-OVI-201", model = "gemini") +# Diff parser vs LLM against the HTML: +morie_siu_compare( + "17-OVI-201", + external = r, + field_map = setNames(as.list(names(r)), names(r)), + external_case_col = "case_number" +) +} +} diff --git a/r-package/morie/man/morie_siu_record_correction.Rd b/r-package/morie/man/morie_siu_record_correction.Rd new file mode 100644 index 0000000000..b40b34a5a7 --- /dev/null +++ b/r-package/morie/man/morie_siu_record_correction.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_record_correction} +\alias{morie_siu_record_correction} +\title{Record a verified correction to the SIU parser's output} +\usage{ +morie_siu_record_correction( + case_number, + field, + verified_value, + note = "", + cache_dir = file.path(tempdir(), "morie", "siu") +) +} +\arguments{ +\item{case_number}{SIU case number, e.g. \code{"17-OVI-201"}.} + +\item{field}{Name of the column in the SIU schema (e.g. +\code{"location_of_call"}).} + +\item{verified_value}{The correct value, verified against the +cached HTML (see \code{morie_siu_audit_case()}).} + +\item{note}{Optional one-line note describing the basis for the +correction (HTML excerpt, LLM verdict, etc.).} + +\item{cache_dir}{Directory holding the harvester's SIU.csv.} +} +\value{ +Invisibly, the path to the updated overrides CSV. +} +\description{ +Saves a (case_number, field, verified_value) tuple to a local +overrides CSV at \code{/canonical_overrides.csv}. Every +subsequent \code{morie_fetch_siu()} on that \code{cache_dir} will +overlay these corrections onto the regex-parsed output. The shipped +\code{inst/extdata/siu_canonical_overrides.csv.gz} carries +maintainer-confirmed corrections; this function lets users add +their own without touching the package source. +} +\details{ +This is the "memory" of the parser: every wrong cell you find and +fix becomes permanent for that cache_dir. Maintainers can submit +corrections upstream by sharing the resulting CSV file. +} +\examples{ +\donttest{ +# Writes the correction to a temp cache so the example never +# touches the per-user cache directory. +tmp <- tempfile("morie_siu_"); dir.create(tmp, recursive = TRUE) +morie_siu_record_correction( + case_number = "17-OVI-201", + field = "location_of_call", + verified_value = "Clair Road East, City of Guelph", + note = "HTML excerpt: 'on Clair Road East in the City of Guelph'", + cache_dir = tmp +) +unlink(tmp, recursive = TRUE) +} +} diff --git a/r-package/morie/man/morie_siu_refresh_manifest.Rd b/r-package/morie/man/morie_siu_refresh_manifest.Rd new file mode 100644 index 0000000000..cb1a8a3099 --- /dev/null +++ b/r-package/morie/man/morie_siu_refresh_manifest.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_refresh_manifest} +\alias{morie_siu_refresh_manifest} +\title{Rebuild the Ontario SIU DRID manifest by probing the live site} +\usage{ +morie_siu_refresh_manifest( + out_path = NULL, + max_drid = NULL, + min_drid = 1L, + concurrency = 4L, + rate_rps = 4, + progress = TRUE +) +} +\arguments{ +\item{out_path}{Path to write the gzipped CSV. Default is the +in-place manifest location (only useful for maintainers building +from a source checkout).} + +\item{max_drid}{Highest drid to probe. Default \code{NULL} +auto-discovers from the SIU index endpoint and adds a margin.} + +\item{min_drid}{Lowest drid to probe (default \code{1L}).} + +\item{concurrency}{Maximum simultaneous transfers (default \code{4}).} + +\item{rate_rps}{Maximum request starts per second (default \code{4}).} + +\item{progress}{Logical; print a per-batch progress line.} +} +\value{ +Invisibly, a data frame of the full sweep (every probed drid, +including misses), parallel to what was written to \code{out_path}. +} +\description{ +Sweeps director's-report ids \code{1..max_drid} and writes a small +CSV recording which ids return a healthy report page, the parsed +case number, and the response body size. The harvester +(\code{morie_fetch_siu}) then uses this manifest to short-circuit +the ~30-50 percent of ids that have no report, saving bandwidth and +WAF-trigger risk on every run. +} +\details{ +The shipped manifest at \code{inst/extdata/siu_drid_manifest.csv.gz} +is a snapshot. Users who want the latest can call this function; +it is also how morie maintainers regenerate the snapshot. +} +\examples{ +\dontrun{ +# Network: refreshes the manifest by probing the SIU site +# (~25-40 min at the default polite rate of 4 RPS for ~6000 ids). +df <- morie_siu_refresh_manifest(out_path = tempfile(fileext = ".csv.gz")) +table(df$http_code) +} +} diff --git a/r-package/morie/man/morie_siu_sanity_check.Rd b/r-package/morie/man/morie_siu_sanity_check.Rd new file mode 100644 index 0000000000..f03e9c3362 --- /dev/null +++ b/r-package/morie/man/morie_siu_sanity_check.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_sanity_check} +\alias{morie_siu_sanity_check} +\title{Row-level sanity check on a parsed SIU table (regex-only, no LLM)} +\usage{ +morie_siu_sanity_check(df) +} +\arguments{ +\item{df}{A data frame in the morie SIU 64-column schema, or a +path to such a CSV.} +} +\value{ +A data frame with one row per source row, columns: +\code{case_number}, \code{drid}, \code{issues_count} (integer +number of suspicious cells), \code{issues} (semicolon-separated +string of \code{field:reason} pairs). Ordered descending by +\code{issues_count}. +} +\description{ +For every row in a parser-emitted SIU table, flag cells that +don't match the expected format for their column -- \code{case_number} +that doesn't look like an SIU case id, \verb{date_*_iso} that isn't a +valid ISO 8601 date, \verb{number_of_*} that isn't a positive integer, +\code{charges_recommended} that isn't "Yes" / "No", etc. Returns a +data frame ranked by issue count so the most-broken rows surface +at the top for manual inspection against the cached HTML. +} +\details{ +Designed to be a fast first-pass quality filter -- runs in +milliseconds, no network, no LLM, no API key. Doesn't try to +verify correctness against the underlying report (that's what +\code{morie_siu_audit_columns()} is for); just checks that each +value MATCHES THE EXPECTED FORMAT for its field. A clean sanity +check is necessary but not sufficient for correctness. +} +\examples{ +\dontrun{ +csv <- morie_fetch_siu(cache_dir = tempdir(), cache_html = TRUE) +sanity <- morie_siu_sanity_check(csv) +head(sanity, 10) # worst 10 rows -- inspect against HTML +table(sanity$issues_count) +} +} diff --git a/r-package/morie/man/morie_siu_translate.Rd b/r-package/morie/man/morie_siu_translate.Rd new file mode 100644 index 0000000000..3c3e8d7db3 --- /dev/null +++ b/r-package/morie/man/morie_siu_translate.Rd @@ -0,0 +1,111 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/siu.R +\name{morie_siu_translate} +\alias{morie_siu_translate} +\alias{morie_siu_translate_fr_to_en} +\title{Translate SIU report text into any target language via local LLM} +\usage{ +morie_siu_translate( + target_lang = NULL, + source_lang = NULL, + case_numbers = NULL, + model = "ollama", + fields = c("narrative_summary", "news_release_summary", "news_release_title", + "relevant_legislation"), + cache_dir = file.path(tempdir(), "morie", "siu"), + progress = TRUE +) + +morie_siu_translate_fr_to_en( + case_numbers = NULL, + model = "ollama", + fields = c("narrative_summary", "news_release_summary", "news_release_title", + "relevant_legislation"), + cache_dir = file.path(tempdir(), "morie", "siu"), + progress = TRUE +) +} +\arguments{ +\item{target_lang}{Target ISO 639-1 language code (or full +language name). Defaults to \code{Sys.getenv("MORIE_USER_LANG")} +or, failing that, the first two characters of +\code{Sys.getenv("LANG")} -- so it picks up the user's +system locale automatically.} + +\item{source_lang}{Source language code, or \code{NULL} (default) +to use each row's parsed \code{_language} field.} + +\item{case_numbers}{Character vector of SIU case numbers to +translate. Defaults to every row whose \code{_language} +differs from \code{target_lang} and has no override yet.} + +\item{model}{LLM model chain (see \code{\link{morie_siu_llm_extract}}). +Default \code{"ollama"} for $0 cost via local Gemma / etc.} + +\item{fields}{Which text fields to translate. Defaults to the +long-form fields that benefit most from translation: +\code{narrative_summary}, \code{news_release_summary}, +\code{news_release_title}, \code{relevant_legislation}.} + +\item{cache_dir}{Directory holding the harvester's SIU.csv and +cached HTML.} + +\item{progress}{Print per-case progress.} +} +\value{ +Invisibly, a data frame of newly-recorded +(case_number, field, verified_value) translations. +} +\description{ +For SIU cases whose parser-emitted text isn't in the reader's +preferred language, translate the long-form text fields into +\code{target_lang} via a local Ollama model (default $0 cost, +no API key) and save each translation as a canonical override. +Subsequent \code{morie_fetch_siu()} runs then return text in +\code{target_lang} for those cases automatically. + +\code{morie_siu_translate_fr_to_en} is a thin +back-compat wrapper that calls \code{morie_siu_translate} +with \code{target_lang = "en", source_lang = "fr"}. +} +\details{ +Use cases: +\itemize{ +\item French-only SIU reports (a few per year of SIU output) +that have no English-paired drid -- translate to "en" +so downstream analyses can join them with the rest. +\item English SIU reports that a Hindi / Spanish / Mandarin / +Punjabi / Arabic / etc. reader needs -- translate to +their first language for accessibility. +\item Any cross-language pivot for community-oriented +publication, where the reader's first language isn't +what the SIU originally published in. +} + +Idempotent (skips cases that already have an override on file +for this \code{target_lang}). Self-improving (every translation +accumulates in \code{/canonical_overrides.csv}, so +the SIU table becomes more accessible every time you run this). +Maintainers can promote the resulting overrides into the +shipped \code{inst/extdata/siu_canonical_overrides.csv.gz}. + +For best speed/quality on multilingual translation use +\code{OLLAMA_MODEL=translategemma:latest} -- a Gemma model +fine-tuned for translation. Falls back to whatever model +\code{OLLAMA_MODEL} points at. +} +\examples{ +\dontrun{ +Sys.setenv( + OLLAMA_HOST = "http://localhost:11434", + OLLAMA_MODEL = "translategemma:latest" +) +csv <- morie_fetch_siu(cache_html = TRUE) +# Translate every non-English row to English: +morie_siu_translate(target_lang = "en") +# Or translate everything to Hindi for a Hindi-first reader: +morie_siu_translate(target_lang = "hi") +# Re-fetch picks up the new overrides automatically: +csv <- morie_fetch_siu(overwrite = TRUE) +} +} diff --git a/r-package/morie/man/morie_spearman_rho.Rd b/r-package/morie/man/morie_spearman_rho.Rd new file mode 100644 index 0000000000..fb006a286b --- /dev/null +++ b/r-package/morie/man/morie_spearman_rho.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_spearman_rho} +\alias{morie_spearman_rho} +\title{Spearman rank correlation} +\usage{ +morie_spearman_rho(x, y) +} +\arguments{ +\item{x}{Numeric vector.} + +\item{y}{Numeric vector.} +} +\value{ +Named list: \code{rho}, \code{p_value}. +} +\description{ +Spearman rank correlation +} +\examples{ +morie_spearman_rho(x = rnorm(50), y = rnorm(50)) +} diff --git a/r-package/morie/man/spectral_density.Rd b/r-package/morie/man/morie_spectral_density.Rd similarity index 69% rename from r-package/morie/man/spectral_density.Rd rename to r-package/morie/man/morie_spectral_density.Rd index f49a7cf114..c64acc6a45 100644 --- a/r-package/morie/man/spectral_density.Rd +++ b/r-package/morie/man/morie_spectral_density.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/specf.R -\name{spectral_density} -\alias{spectral_density} +\name{morie_spectral_density} +\alias{morie_spectral_density} \title{Welch power spectral density} \usage{ -spectral_density(x, fs = 1, nperseg = NULL) +morie_spectral_density(x, fs = 1, nperseg = NULL) } \arguments{ \item{x}{Numeric univariate series.} @@ -21,8 +21,5 @@ Named list with \code{frequencies, psd, n_segments, nperseg, Welch power spectral density } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_spectral_density(x = rnorm(50)) } diff --git a/r-package/morie/man/state_space_model.Rd b/r-package/morie/man/morie_state_space_model.Rd similarity index 70% rename from r-package/morie/man/state_space_model.Rd rename to r-package/morie/man/morie_state_space_model.Rd index d7888d2990..65a885015f 100644 --- a/r-package/morie/man/state_space_model.Rd +++ b/r-package/morie/man/morie_state_space_model.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ssmod.R -\name{state_space_model} -\alias{state_space_model} +\name{morie_state_space_model} +\alias{morie_state_space_model} \title{Local-level state-space model (Kalman filter+smoother)} \usage{ -state_space_model(x) +morie_state_space_model(x) } \arguments{ \item{x}{Numeric univariate series.} @@ -17,8 +17,5 @@ Named list with \code{filtered_state, filtered_state_variance, Local-level state-space model (Kalman filter+smoother) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_state_space_model(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_stratified_sample.Rd b/r-package/morie/man/morie_stratified_sample.Rd new file mode 100644 index 0000000000..d68299b5bf --- /dev/null +++ b/r-package/morie/man/morie_stratified_sample.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampling.R +\name{morie_stratified_sample} +\alias{morie_stratified_sample} +\title{Proportional or fixed stratified random sample} +\usage{ +morie_stratified_sample( + df, + strata_col, + n_per_stratum, + proportional = FALSE, + seed = 42L +) +} +\arguments{ +\item{df}{A data frame.} + +\item{strata_col}{Name of the stratification column.} + +\item{n_per_stratum}{Either an integer (equal allocation) or a named integer +vector mapping stratum levels to sample sizes. If \code{proportional = TRUE}, +\code{n_per_stratum} is treated as the total desired sample size and allocation +is proportional to stratum size.} + +\item{proportional}{Logical; if \code{TRUE}, allocate proportionally to strata sizes.} + +\item{seed}{Random seed.} +} +\value{ +Data frame of sampled rows with a \code{.weight} column. +} +\description{ +Proportional or fixed stratified random sample +} +\examples{ +df <- data.frame(g = c(rep("A", 60), rep("B", 40)), x = rnorm(100)) +morie_stratified_sample(df, "g", n_per_stratum = 10) +} diff --git a/r-package/morie/man/suggest_analysis_plan.Rd b/r-package/morie/man/morie_suggest_analysis_plan.Rd similarity index 50% rename from r-package/morie/man/suggest_analysis_plan.Rd rename to r-package/morie/man/morie_suggest_analysis_plan.Rd index bff651e35f..147983e84a 100644 --- a/r-package/morie/man/suggest_analysis_plan.Rd +++ b/r-package/morie/man/morie_suggest_analysis_plan.Rd @@ -1,22 +1,22 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dataset_profile.R -\name{suggest_analysis_plan} -\alias{suggest_analysis_plan} +\name{morie_suggest_analysis_plan} +\alias{morie_suggest_analysis_plan} \title{Suggest an analysis plan from a dataset profile} \usage{ -suggest_analysis_plan(profile) +morie_suggest_analysis_plan(profile) } \arguments{ -\item{profile}{A list returned by \code{\link[=profile_dataset]{profile_dataset()}}.} +\item{profile}{A list returned by \code{\link[=morie_profile_dataset]{morie_profile_dataset()}}.} } \value{ Character vector of suggestion strings, one per recommendation. } \description{ Mirrors the Python \code{morie.suggest_analysis_plan()}. Inspects the output -of \code{\link[=profile_dataset]{profile_dataset()}} and returns plain-English recommendations for +of \code{\link[=morie_profile_dataset]{morie_profile_dataset()}} and returns plain-English recommendations for candidate analyses. } \examples{ -suggest_analysis_plan(profile_dataset(iris)) +morie_suggest_analysis_plan(morie_profile_dataset(iris)) } diff --git a/r-package/morie/man/sukhatme_test.Rd b/r-package/morie/man/morie_sukhatme_test.Rd similarity index 71% rename from r-package/morie/man/sukhatme_test.Rd rename to r-package/morie/man/morie_sukhatme_test.Rd index 4e7241f183..6a6aeab3d1 100644 --- a/r-package/morie/man/sukhatme_test.Rd +++ b/r-package/morie/man/morie_sukhatme_test.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/sukht.R -\name{sukhatme_test} -\alias{sukhatme_test} +\name{morie_sukhatme_test} +\alias{morie_sukhatme_test} \title{Sukhatme two-sample scale test (Gibbons Ch 9.7)} \usage{ -sukhatme_test(x, y) +morie_sukhatme_test(x, y) } \arguments{ \item{x, y}{Numeric vectors.} @@ -17,8 +17,5 @@ Mann-Whitney U on the absolute deviations from the pooled median. Tests equality of scales given (approximately) equal medians. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_sukhatme_test(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/morie_summarize_output_audit.Rd b/r-package/morie/man/morie_summarize_output_audit.Rd new file mode 100644 index 0000000000..3851d1e133 --- /dev/null +++ b/r-package/morie/man/morie_summarize_output_audit.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/manifest.R +\name{morie_summarize_output_audit} +\alias{morie_summarize_output_audit} +\title{Summarize an output audit} +\usage{ +morie_summarize_output_audit(audit_tbl) +} +\arguments{ +\item{audit_tbl}{Result from \code{\link[=morie_audit_public_outputs]{morie_audit_public_outputs()}}.} +} +\value{ +Named list with high-level diagnostics. +} +\description{ +Summarize an output audit +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/svm_genomic.Rd b/r-package/morie/man/morie_svm_genomic.Rd similarity index 74% rename from r-package/morie/man/svm_genomic.Rd rename to r-package/morie/man/morie_svm_genomic.Rd index ef10d35073..82e5026b9c 100644 --- a/r-package/morie/man/svm_genomic.Rd +++ b/r-package/morie/man/morie_svm_genomic.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/svmge.R -\name{svm_genomic} -\alias{svm_genomic} +\name{morie_svm_genomic} +\alias{morie_svm_genomic} \title{Support-vector regression for genomic prediction} \usage{ -svm_genomic(x, y, markers, C = 1, epsilon = 0.1, gamma = "scale") +morie_svm_genomic(x, y, markers, C = 1, epsilon = 0.1, gamma = "scale") } \arguments{ \item{x}{Optional fixed-effect features.} @@ -26,10 +26,7 @@ list(estimate, y_hat, alpha, support_indices, se, n, method). Support-vector regression for genomic prediction } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_svm_genomic(x = rnorm(50), y = rnorm(50), markers = matrix(sample(0:2, 200, TRUE), 50, 4)) } \references{ Vapnik (1995); Montesinos Lopez Ch 7. diff --git a/r-package/morie/man/svm_hinge_primal.Rd b/r-package/morie/man/morie_svm_hinge_primal.Rd similarity index 71% rename from r-package/morie/man/svm_hinge_primal.Rd rename to r-package/morie/man/morie_svm_hinge_primal.Rd index 00ab6043b7..fc2e274a05 100644 --- a/r-package/morie/man/svm_hinge_primal.Rd +++ b/r-package/morie/man/morie_svm_hinge_primal.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/svmhg.R -\name{svm_hinge_primal} -\alias{svm_hinge_primal} +\name{morie_svm_hinge_primal} +\alias{morie_svm_hinge_primal} \title{Linear SVM (primal hinge loss) -- R parity} \usage{ -svm_hinge_primal(x, y, C = 1, seed = 0L) +morie_svm_hinge_primal(x, y, C = 1, seed = 0L) } \arguments{ \item{x}{Numeric predictor matrix.} @@ -23,8 +23,6 @@ classes, n, method. Wraps \code{e1071::svm} with a linear kernel. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/svm_kernel_trick.Rd b/r-package/morie/man/morie_svm_kernel_trick.Rd similarity index 81% rename from r-package/morie/man/svm_kernel_trick.Rd rename to r-package/morie/man/morie_svm_kernel_trick.Rd index 10c78e3281..2d68b2c345 100644 --- a/r-package/morie/man/svm_kernel_trick.Rd +++ b/r-package/morie/man/morie_svm_kernel_trick.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/svmkr.R -\name{svm_kernel_trick} -\alias{svm_kernel_trick} +\name{morie_svm_kernel_trick} +\alias{morie_svm_kernel_trick} \title{Kernel SVM (RBF / poly / sigmoid) -- R parity} \usage{ -svm_kernel_trick( +morie_svm_kernel_trick( x, y, kernel = "rbf", @@ -37,8 +37,5 @@ gamma, degree, n, method. Wraps \code{e1071::svm}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_svm_kernel_trick(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/morie_sync_rng.Rd b/r-package/morie/man/morie_sync_rng.Rd index 72fc38588c..6022457f55 100644 --- a/r-package/morie/man/morie_sync_rng.Rd +++ b/r-package/morie/man/morie_sync_rng.Rd @@ -20,8 +20,5 @@ on the Python side so the two emit identical streams when given the same seed. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_sync_rng(seed = 1L) } diff --git a/r-package/morie/man/terry_hoeffding_test.Rd b/r-package/morie/man/morie_terry_hoeffding_test.Rd similarity index 62% rename from r-package/morie/man/terry_hoeffding_test.Rd rename to r-package/morie/man/morie_terry_hoeffding_test.Rd index f94c8c2908..5d73fd61a5 100644 --- a/r-package/morie/man/terry_hoeffding_test.Rd +++ b/r-package/morie/man/morie_terry_hoeffding_test.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/thfdt.R -\name{terry_hoeffding_test} -\alias{terry_hoeffding_test} +\name{morie_terry_hoeffding_test} +\alias{morie_terry_hoeffding_test} \title{Terry-Hoeffding (Fisher-Yates) two-sample normal-scores test (Gibbons Ch 8.3.1)} \usage{ -terry_hoeffding_test(x, y) +morie_terry_hoeffding_test(x, y) } \arguments{ \item{x, y}{Numeric vectors.} @@ -15,12 +15,9 @@ Named list: statistic, p_value, z, n, m. } \description{ Replaces pooled ranks with Blom-approximated normal scores -a_i = qnorm((R_i - 3/8) / (N + 1/4)). Statistic T = sum of +a_i = qnorm((R_i - 3/8) / (N + 1/4)). Statistic stat_t = sum of scores from the first sample. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_terry_hoeffding_test(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/tgarch_model.Rd b/r-package/morie/man/morie_tgarch_model.Rd similarity index 70% rename from r-package/morie/man/tgarch_model.Rd rename to r-package/morie/man/morie_tgarch_model.Rd index 483b7d3f69..9c342bf940 100644 --- a/r-package/morie/man/tgarch_model.Rd +++ b/r-package/morie/man/morie_tgarch_model.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tgrch.R -\name{tgarch_model} -\alias{tgarch_model} +\name{morie_tgarch_model} +\alias{morie_tgarch_model} \title{GJR-GARCH(1,1) threshold GARCH} \usage{ -tgarch_model(x) +morie_tgarch_model(x) } \arguments{ \item{x}{Numeric return series.} @@ -17,8 +17,5 @@ Named list with \code{omega, alpha, gamma, beta, persistence, GJR-GARCH(1,1) threshold GARCH } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_tgarch_model(x = rnorm(50)) } diff --git a/r-package/morie/man/threshold_autoregression.Rd b/r-package/morie/man/morie_threshold_autoregression.Rd similarity index 73% rename from r-package/morie/man/threshold_autoregression.Rd rename to r-package/morie/man/morie_threshold_autoregression.Rd index c1b881f5fc..88c01e0f57 100644 --- a/r-package/morie/man/threshold_autoregression.Rd +++ b/r-package/morie/man/morie_threshold_autoregression.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tarmd.R -\name{threshold_autoregression} -\alias{threshold_autoregression} +\name{morie_threshold_autoregression} +\alias{morie_threshold_autoregression} \title{Two-regime self-exciting threshold autoregressive (SETAR) model} \usage{ -threshold_autoregression(x, p = 1, d = 1, n_grid = 50) +morie_threshold_autoregression(x, p = 1, d = 1, n_grid = 50) } \arguments{ \item{x}{Numeric univariate series.} @@ -23,8 +23,5 @@ Named list with \code{threshold, phi_lower, phi_upper, p, d, Two-regime self-exciting threshold autoregressive (SETAR) model } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_threshold_autoregression(x = rnorm(50)) } diff --git a/r-package/morie/man/tolerance_limits.Rd b/r-package/morie/man/morie_tolerance_limits.Rd similarity index 80% rename from r-package/morie/man/tolerance_limits.Rd rename to r-package/morie/man/morie_tolerance_limits.Rd index d61ec37e54..7eec6d41c5 100644 --- a/r-package/morie/man/tolerance_limits.Rd +++ b/r-package/morie/man/morie_tolerance_limits.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tolim.R -\name{tolerance_limits} -\alias{tolerance_limits} +\name{morie_tolerance_limits} +\alias{morie_tolerance_limits} \title{Distribution-free (Wilks) tolerance limits} \usage{ -tolerance_limits(x, coverage = 0.9, confidence = 0.95) +morie_tolerance_limits(x, coverage = 0.9, confidence = 0.95) } \arguments{ \item{x}{Numeric vector.} @@ -27,7 +27,7 @@ P(coverage of (X_(1), X_(n)) >= beta) = 1 - n * beta^(n-1) + (n - 1) * beta^n } \examples{ -tolerance_limits(1:100, coverage = 0.90, confidence = 0.95) +morie_tolerance_limits(1:100, coverage = 0.90, confidence = 0.95) } \references{ Wilks (1941); Gibbons & Chakraborti (6e) Ch 2.11. diff --git a/r-package/morie/man/morie_tps_layer_urls.Rd b/r-package/morie/man/morie_tps_layer_urls.Rd index 3b6082f61c..c8b9201752 100644 --- a/r-package/morie/man/morie_tps_layer_urls.Rd +++ b/r-package/morie/man/morie_tps_layer_urls.Rd @@ -15,6 +15,6 @@ TPS ArcGIS layer URLs known to MORIE } \examples{ urls <- morie_tps_layer_urls() -names(urls) # categories: Assault, AutoTheft, Homicide, ... -length(urls) # number of layers +names(urls) # categories: Assault, AutoTheft, Homicide, ... +length(urls) # number of layers } diff --git a/r-package/morie/man/transformer_genomic.Rd b/r-package/morie/man/morie_transformer_genomic.Rd similarity index 82% rename from r-package/morie/man/transformer_genomic.Rd rename to r-package/morie/man/morie_transformer_genomic.Rd index 11bd7867fa..efdbb6ee6c 100644 --- a/r-package/morie/man/transformer_genomic.Rd +++ b/r-package/morie/man/morie_transformer_genomic.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/trfge.R -\name{transformer_genomic} -\alias{transformer_genomic} +\name{morie_transformer_genomic} +\alias{morie_transformer_genomic} \title{Transformer (1-head self-attention) genomic predictor (base R)} \usage{ -transformer_genomic( +morie_transformer_genomic( x, y, markers, @@ -39,10 +39,10 @@ list(estimate, y_hat, beta, attention, context, se, n, method). Random fixed projections + ridge head on the mean-pooled context vector. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_transformer_genomic( + x = rnorm(50), y = rnorm(50), + markers = matrix(sample(0:2, 200, TRUE), 50, 4) +) } \references{ Vaswani et al. (2017). Montesinos Lopez Ch 15. diff --git a/r-package/morie/man/trfbl_transformer_block.Rd b/r-package/morie/man/morie_trfbl_transformer_block.Rd similarity index 81% rename from r-package/morie/man/trfbl_transformer_block.Rd rename to r-package/morie/man/morie_trfbl_transformer_block.Rd index af5a310b9d..3eda1cb2e9 100644 --- a/r-package/morie/man/trfbl_transformer_block.Rd +++ b/r-package/morie/man/morie_trfbl_transformer_block.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/trfbl.R -\name{trfbl_transformer_block} -\alias{trfbl_transformer_block} -\alias{transformer_block} +\name{morie_trfbl_transformer_block} +\alias{morie_trfbl_transformer_block} +\alias{morie_transformer_block} \title{Transformer encoder block (post-LN)} \usage{ -trfbl_transformer_block( +morie_trfbl_transformer_block( x, num_heads = 2L, d_ff = NULL, @@ -13,7 +13,7 @@ trfbl_transformer_block( deterministic_seed = NULL ) -transformer_block( +morie_transformer_block( x, num_heads = 2L, d_ff = NULL, @@ -46,10 +46,8 @@ R parity for \code{morie.fn.trfbl.transformer_block}. h_2 = \mathrm{LN}(h_1 + \mathrm{FFN}(h_1))} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Vaswani et al. (2017), NeurIPS. diff --git a/r-package/morie/man/tsne_reduction.Rd b/r-package/morie/man/morie_tsne_reduction.Rd similarity index 86% rename from r-package/morie/man/tsne_reduction.Rd rename to r-package/morie/man/morie_tsne_reduction.Rd index be95f69eef..a9918866c6 100644 --- a/r-package/morie/man/tsne_reduction.Rd +++ b/r-package/morie/man/morie_tsne_reduction.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tsnrd.R -\name{tsne_reduction} -\alias{tsne_reduction} +\name{morie_tsne_reduction} +\alias{morie_tsne_reduction} \title{t-SNE for non-linear dimension reduction (R parity)} \usage{ -tsne_reduction( +morie_tsne_reduction( x, n_components = 2L, perplexity = 30, @@ -40,8 +40,6 @@ perplexity, n_components, n, method. Wraps \code{Rtsne::Rtsne}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } diff --git a/r-package/morie/man/two_sample_coverage.Rd b/r-package/morie/man/morie_two_sample_coverage.Rd similarity index 74% rename from r-package/morie/man/two_sample_coverage.Rd rename to r-package/morie/man/morie_two_sample_coverage.Rd index 94a425906e..495f0970f8 100644 --- a/r-package/morie/man/two_sample_coverage.Rd +++ b/r-package/morie/man/morie_two_sample_coverage.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cov2s.R -\name{two_sample_coverage} -\alias{two_sample_coverage} +\name{morie_two_sample_coverage} +\alias{morie_two_sample_coverage} \title{Two-sample placement coverage (Gibbons Ch 2.11.2)} \usage{ -two_sample_coverage(x, y) +morie_two_sample_coverage(x, y) } \arguments{ \item{x}{Numeric vector (first sample).} @@ -21,8 +21,5 @@ ordered X-sample. Under H0 the expected block proportion is \code{1 / (m + 1)}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_two_sample_coverage(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/morie_two_sample_t_test.Rd b/r-package/morie/man/morie_two_sample_t_test.Rd new file mode 100644 index 0000000000..0d6fcdd126 --- /dev/null +++ b/r-package/morie/man/morie_two_sample_t_test.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_two_sample_t_test} +\alias{morie_two_sample_t_test} +\title{Two-sample t-test with tidy output} +\usage{ +morie_two_sample_t_test( + x1, + x2, + equal_var = FALSE, + alternative = c("two.sided", "greater", "less") +) +} +\arguments{ +\item{x1}{Numeric vector (group 1).} + +\item{x2}{Numeric vector (group 2).} + +\item{equal_var}{Assume equal variances? Default \code{FALSE} (Welch test).} + +\item{alternative}{\code{"two.sided"}, \code{"greater"}, or \code{"less"}.} +} +\value{ +Named list: \code{t}, \code{df}, \code{p_value}, \code{ci_diff}, \code{morie_cohens_d}. +} +\description{ +Two-sample t-test with tidy output +} +\examples{ +morie_two_sample_t_test(rnorm(50, 0.5), rnorm(50, 0)) +} diff --git a/r-package/morie/man/unobserved_components.Rd b/r-package/morie/man/morie_unobserved_components.Rd similarity index 71% rename from r-package/morie/man/unobserved_components.Rd rename to r-package/morie/man/morie_unobserved_components.Rd index f60c87c593..526591c1b4 100644 --- a/r-package/morie/man/unobserved_components.Rd +++ b/r-package/morie/man/morie_unobserved_components.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ucmod.R -\name{unobserved_components} -\alias{unobserved_components} +\name{morie_unobserved_components} +\alias{morie_unobserved_components} \title{Unobserved-components decomposition (trend + seasonal + irregular)} \usage{ -unobserved_components(x, period = 12, trend = "local linear") +morie_unobserved_components(x, period = 12, trend = "local linear") } \arguments{ \item{x}{Numeric univariate series.} @@ -21,8 +21,5 @@ Named list with \code{trend, seasonal, irregular, loglik, n, Unobserved-components decomposition (trend + seasonal + irregular) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_unobserved_components(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_userguide.Rd b/r-package/morie/man/morie_userguide.Rd index df6f7eb81d..6a39c287fd 100644 --- a/r-package/morie/man/morie_userguide.Rd +++ b/r-package/morie/man/morie_userguide.Rd @@ -18,8 +18,5 @@ Lists or retrieves bundled userguide PDF files. These are the official PUMF codebooks and user guides from Health Canada / Statistics Canada. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_userguide() } diff --git a/r-package/morie/man/vaenc_vae_elbo.Rd b/r-package/morie/man/morie_vaenc_vae_elbo.Rd similarity index 73% rename from r-package/morie/man/vaenc_vae_elbo.Rd rename to r-package/morie/man/morie_vaenc_vae_elbo.Rd index 51b18dbf8e..a7fbd1e20e 100644 --- a/r-package/morie/man/vaenc_vae_elbo.Rd +++ b/r-package/morie/man/morie_vaenc_vae_elbo.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/vaenc.R -\name{vaenc_vae_elbo} -\alias{vaenc_vae_elbo} -\alias{vae_elbo} +\name{morie_vaenc_vae_elbo} +\alias{morie_vaenc_vae_elbo} +\alias{morie_vae_elbo} \title{Variational autoencoder ELBO} \usage{ -vaenc_vae_elbo(x, x_recon, mu, log_var, reduction = "mean") +morie_vaenc_vae_elbo(x, x_recon, mu, log_var, reduction = "mean") -vae_elbo(x, x_recon, mu, log_var, reduction = "mean") +morie_vae_elbo(x, x_recon, mu, log_var, reduction = "mean") } \arguments{ \item{x}{Original input.} @@ -35,10 +35,8 @@ Gaussian encoder + standard-normal prior closed-form KL: ELBO uses Gaussian (MSE) reconstruction term. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Kingma & Welling (2014), ICLR. diff --git a/r-package/morie/man/morie_validate_cpads_data.Rd b/r-package/morie/man/morie_validate_cpads_data.Rd new file mode 100644 index 0000000000..9b6c82a124 --- /dev/null +++ b/r-package/morie/man/morie_validate_cpads_data.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ipw.R +\name{morie_validate_cpads_data} +\alias{morie_validate_cpads_data} +\title{Validate a CPADS analysis data frame} +\usage{ +morie_validate_cpads_data(data, strict = TRUE) +} +\arguments{ +\item{data}{Data frame to validate.} + +\item{strict}{If \code{TRUE}, stop when required variables are missing.} +} +\value{ +Character vector of missing variable names. +} +\description{ +Validate a CPADS analysis data frame +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_validate_outputs_manifest.Rd b/r-package/morie/man/morie_validate_outputs_manifest.Rd new file mode 100644 index 0000000000..de1cf7e13e --- /dev/null +++ b/r-package/morie/man/morie_validate_outputs_manifest.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/manifest.R +\name{morie_validate_outputs_manifest} +\alias{morie_validate_outputs_manifest} +\title{Validate outputs manifest structure} +\usage{ +morie_validate_outputs_manifest(manifest, strict = TRUE) +} +\arguments{ +\item{manifest}{Data frame to validate.} + +\item{strict}{If \code{TRUE}, stop on validation failures.} +} +\value{ +\code{TRUE} when validation passes. +} +\description{ +Validate outputs manifest structure +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/van_der_waerden_test.Rd b/r-package/morie/man/morie_van_der_waerden_test.Rd similarity index 67% rename from r-package/morie/man/van_der_waerden_test.Rd rename to r-package/morie/man/morie_van_der_waerden_test.Rd index da890df933..fd09c4fc47 100644 --- a/r-package/morie/man/van_der_waerden_test.Rd +++ b/r-package/morie/man/morie_van_der_waerden_test.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/vdwrd.R -\name{van_der_waerden_test} -\alias{van_der_waerden_test} +\name{morie_van_der_waerden_test} +\alias{morie_van_der_waerden_test} \title{Van der Waerden two-sample normal-scores location test (Gibbons Ch 8.3.2)} \usage{ -van_der_waerden_test(x, y) +morie_van_der_waerden_test(x, y) } \arguments{ \item{x, y}{Numeric vectors.} @@ -18,8 +18,5 @@ Scores a_i = qnorm(R_i / (N + 1)); statistic = sum over the first sample. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_van_der_waerden_test(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/vecm.Rd b/r-package/morie/man/morie_vecm.Rd similarity index 75% rename from r-package/morie/man/vecm.Rd rename to r-package/morie/man/morie_vecm.Rd index 74e519f7dc..17b0eb47f9 100644 --- a/r-package/morie/man/vecm.Rd +++ b/r-package/morie/man/morie_vecm.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/vecmf.R -\name{vecm} -\alias{vecm} +\name{morie_vecm} +\alias{morie_vecm} \title{Vector error-correction model (VECM)} \usage{ -vecm(Y, k_ar = 1, coint_rank = 1) +morie_vecm(Y, k_ar = 1, coint_rank = 1) } \arguments{ \item{Y}{Numeric matrix (T x k) of I(1) candidate series.} @@ -21,8 +21,5 @@ Named list with \code{alpha, beta, Gamma, Sigma, loglik, n, k, Vector error-correction model (VECM) } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_vecm(Y = matrix(rnorm(100), 50, 2)) } diff --git a/r-package/morie/man/verify_statistical_output.Rd b/r-package/morie/man/morie_verify_statistical_output.Rd similarity index 85% rename from r-package/morie/man/verify_statistical_output.Rd rename to r-package/morie/man/morie_verify_statistical_output.Rd index d30deb26a1..d546bd58df 100644 --- a/r-package/morie/man/verify_statistical_output.Rd +++ b/r-package/morie/man/morie_verify_statistical_output.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/inspector.R -\name{verify_statistical_output} -\alias{verify_statistical_output} +\name{morie_verify_statistical_output} +\alias{morie_verify_statistical_output} \title{Verify that a serialised statistical output meets minimum quality gates} \usage{ -verify_statistical_output(path) +morie_verify_statistical_output(path) } \arguments{ \item{path}{Path to a JSON output file.} @@ -29,9 +29,10 @@ tmp <- tempfile(fileext = ".json") if (requireNamespace("jsonlite", quietly = TRUE)) { jsonlite::write_json( list(ate = 0.5, se = 0.1, ci_lower = 0.3, ci_upper = 0.7, n = 200), - tmp, auto_unbox = TRUE + tmp, + auto_unbox = TRUE ) - verify_statistical_output(tmp) + morie_verify_statistical_output(tmp) unlink(tmp) } } diff --git a/r-package/morie/man/wavelet_time_series.Rd b/r-package/morie/man/morie_wavelet_time_series.Rd similarity index 70% rename from r-package/morie/man/wavelet_time_series.Rd rename to r-package/morie/man/morie_wavelet_time_series.Rd index f1618e6b1e..aaf7a6458e 100644 --- a/r-package/morie/man/wavelet_time_series.Rd +++ b/r-package/morie/man/morie_wavelet_time_series.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/wavts.R -\name{wavelet_time_series} -\alias{wavelet_time_series} +\name{morie_wavelet_time_series} +\alias{morie_wavelet_time_series} \title{Discrete wavelet decomposition for a time series} \usage{ -wavelet_time_series(x, wavelet = "haar", level = NULL) +morie_wavelet_time_series(x, wavelet = "haar", level = NULL) } \arguments{ \item{x}{Numeric univariate series.} @@ -21,8 +21,5 @@ Named list with \code{approximation, details, energies, level, Discrete wavelet decomposition for a time series } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_wavelet_time_series(x = rnorm(50)) } diff --git a/r-package/morie/man/wilcoxon_power.Rd b/r-package/morie/man/morie_wilcoxon_power.Rd similarity index 74% rename from r-package/morie/man/wilcoxon_power.Rd rename to r-package/morie/man/morie_wilcoxon_power.Rd index b4ca2253d5..77cca90b78 100644 --- a/r-package/morie/man/wilcoxon_power.Rd +++ b/r-package/morie/man/morie_wilcoxon_power.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/wsrpw.R -\name{wilcoxon_power} -\alias{wilcoxon_power} +\name{morie_wilcoxon_power} +\alias{morie_wilcoxon_power} \title{Monte-Carlo power of the Wilcoxon signed-rank test (Gibbons Ch 5.7.3)} \usage{ -wilcoxon_power(x, effect_size = 0.5, alpha = 0.05, nsim = 2000, seed = 0) +morie_wilcoxon_power(x, effect_size = 0.5, alpha = 0.05, nsim = 2000, seed = 0) } \arguments{ \item{x}{Numeric vector (only \code{length(x)} is used).} @@ -27,8 +27,5 @@ and reports the rejection rate of two-sided wilcox.test at level alpha. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_wilcoxon_power(x = rnorm(50)) } diff --git a/r-package/morie/man/morie_wilcoxon_signed_rank_test.Rd b/r-package/morie/man/morie_wilcoxon_signed_rank_test.Rd new file mode 100644 index 0000000000..ddb10ae70f --- /dev/null +++ b/r-package/morie/man/morie_wilcoxon_signed_rank_test.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inference.R +\name{morie_wilcoxon_signed_rank_test} +\alias{morie_wilcoxon_signed_rank_test} +\title{Wilcoxon signed-rank test (paired)} +\usage{ +morie_wilcoxon_signed_rank_test( + x1, + x2, + alternative = c("two.sided", "greater", "less") +) +} +\arguments{ +\item{x1}{Numeric vector (before).} + +\item{x2}{Numeric vector (after).} + +\item{alternative}{\code{"two.sided"}, \code{"greater"}, or \code{"less"}.} +} +\value{ +Named list: \code{V}, \code{p_value}. +} +\description{ +Wilcoxon signed-rank test (paired) +} +\examples{ +# See the package vignettes for usage examples: +# vignette(package = "morie") +} diff --git a/r-package/morie/man/morie_write_synthetic_data.Rd b/r-package/morie/man/morie_write_synthetic_data.Rd new file mode 100644 index 0000000000..bd22e51639 --- /dev/null +++ b/r-package/morie/man/morie_write_synthetic_data.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/synthetic.R +\name{morie_write_synthetic_data} +\alias{morie_write_synthetic_data} +\title{Write synthetic epidemiology-style data to CSV} +\usage{ +morie_write_synthetic_data( + path, + n = 5000L, + seed = 42L, + special_code_rate = 0.02, + profile = c("generic", "morie_legacy"), + name_map = NULL, + overwrite = FALSE +) +} +\arguments{ +\item{path}{Output CSV path.} + +\item{n}{Number of rows.} + +\item{seed}{Random seed.} + +\item{special_code_rate}{Proportion of survey-style missing codes.} + +\item{profile}{Naming profile for output columns.} + +\item{name_map}{Optional custom variable name map.} + +\item{overwrite}{If \code{TRUE}, overwrite existing file.} +} +\value{ +Normalized output path. +} +\description{ +Write synthetic epidemiology-style data to CSV +} +\examples{ +out <- morie_write_synthetic_data(tempfile(fileext = ".csv"), n = 200, seed = 1) +file.exists(out) +} diff --git a/r-package/morie/man/xavir_xavier_init.Rd b/r-package/morie/man/morie_xavir_xavier_init.Rd similarity index 69% rename from r-package/morie/man/xavir_xavier_init.Rd rename to r-package/morie/man/morie_xavir_xavier_init.Rd index 4e64052939..f3bf8c980e 100644 --- a/r-package/morie/man/xavir_xavier_init.Rd +++ b/r-package/morie/man/morie_xavir_xavier_init.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/xavir.R -\name{xavir_xavier_init} -\alias{xavir_xavier_init} -\alias{xavier_initialization} +\name{morie_xavir_xavier_init} +\alias{morie_xavir_xavier_init} +\alias{morie_xavier_initialization} \title{Xavier / Glorot weight initialization} \usage{ -xavir_xavier_init(fan_in, fan_out, seed = 42L, uniform = TRUE) +morie_xavir_xavier_init(fan_in, fan_out, seed = 42L, uniform = TRUE) -xavier_initialization(fan_in, fan_out, seed = 42L, uniform = TRUE) +morie_xavier_initialization(fan_in, fan_out, seed = 42L, uniform = TRUE) } \arguments{ \item{fan_in}{Number of input units.} @@ -32,10 +32,8 @@ R parity for \code{morie.fn.xavir.xavier_init}. (uniform). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Glorot & Bengio (2010), AISTATS. diff --git a/r-package/morie/man/xgboost_objective.Rd b/r-package/morie/man/morie_xgboost_objective.Rd similarity index 88% rename from r-package/morie/man/xgboost_objective.Rd rename to r-package/morie/man/morie_xgboost_objective.Rd index a7ea733100..a48e656d96 100644 --- a/r-package/morie/man/xgboost_objective.Rd +++ b/r-package/morie/man/morie_xgboost_objective.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgbst.R -\name{xgboost_objective} -\alias{xgboost_objective} +\name{morie_xgboost_objective} +\alias{morie_xgboost_objective} \title{XGBoost regularized objective (R parity)} \usage{ -xgboost_objective( +morie_xgboost_objective( x, y, n_estimators = 100L, @@ -52,8 +52,5 @@ back to \code{gbm} (gradient boosting) so users still get a usable boosted-trees result; the backend is flagged in the output. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +morie_xgboost_objective(x = rnorm(50), y = rnorm(50)) } diff --git a/r-package/morie/man/mrm_anova_bonferroni.Rd b/r-package/morie/man/mrm_anova_bonferroni.Rd index dc8eb34f41..376dd6d1fe 100644 --- a/r-package/morie/man/mrm_anova_bonferroni.Rd +++ b/r-package/morie/man/mrm_anova_bonferroni.Rd @@ -30,6 +30,6 @@ df <- data.frame( g = rep(c("A", "B", "C"), each = n) ) res <- mrm_anova_bonferroni(df, response_col = "y", group_col = "g") -res$alpha_per_pair # Bonferroni-corrected per-pair alpha -res$pairs # per-pair t-tests with adjusted significance flags +res$alpha_per_pair # Bonferroni-corrected per-pair alpha +res$pairs # per-pair t-tests with adjusted significance flags } diff --git a/r-package/morie/man/mrm_anova_power.Rd b/r-package/morie/man/mrm_anova_power.Rd index 484f4e5e0f..bd426ab13e 100644 --- a/r-package/morie/man/mrm_anova_power.Rd +++ b/r-package/morie/man/mrm_anova_power.Rd @@ -25,13 +25,18 @@ Power of one-way ANOVA given Cohen's f \examples{ # Power to detect a medium effect (Cohen's f = 0.25) with 4 groups # of 30 each at alpha = 0.05: -res <- mrm_anova_power(k_groups = 4, n_per_group = 30, - effect_size_f = 0.25, alpha = 0.05) +res <- mrm_anova_power( + k_groups = 4, n_per_group = 30, + effect_size_f = 0.25, alpha = 0.05 +) res$power res$F_critical # Sample-size sensitivity: what power do I get with smaller groups? -sapply(c(10, 20, 30, 50, 100), function(n) - mrm_anova_power(k_groups = 3, n_per_group = n, - effect_size_f = 0.25)$power) +sapply(c(10, 20, 30, 50, 100), function(n) { + mrm_anova_power( + k_groups = 3, n_per_group = n, + effect_size_f = 0.25 + )$power +}) } diff --git a/r-package/morie/man/mrm_assumptions_check.Rd b/r-package/morie/man/mrm_assumptions_check.Rd index 1a49fd7432..f812d4927f 100644 --- a/r-package/morie/man/mrm_assumptions_check.Rd +++ b/r-package/morie/man/mrm_assumptions_check.Rd @@ -30,8 +30,10 @@ x <- rnorm(n) D <- rbinom(n, 1, plogis(0.5 * x)) y <- 0.7 * D + 0.3 * x + rnorm(n) df <- data.frame(D = D, y = y, age = x) -chk <- mrm_assumptions_check(df, treatment_col = "D", - outcome_col = "y", - covariates = "age") +chk <- mrm_assumptions_check(df, + treatment_col = "D", + outcome_col = "y", + covariates = "age" +) chk$overall_verdict } diff --git a/r-package/morie/man/mrm_causal_design.Rd b/r-package/morie/man/mrm_causal_design.Rd index f9c2f45007..1f272e5185 100644 --- a/r-package/morie/man/mrm_causal_design.Rd +++ b/r-package/morie/man/mrm_causal_design.Rd @@ -42,13 +42,17 @@ D <- rbinom(n, 1, plogis(0.5 * x)) y <- 0.7 * D + 0.3 * x + rnorm(n, 0, 0.5) df <- data.frame(D = D, y = y, age = x) # IPW-adjusted ATE -ipw <- mrm_causal_design(df, treatment_col = "D", - outcome_col = "y", - covariates = "age", - estimator = "ipw") +ipw <- mrm_causal_design(df, + treatment_col = "D", + outcome_col = "y", + covariates = "age", + estimator = "ipw" +) # Naive difference in means for comparison -raw <- mrm_causal_design(df, treatment_col = "D", - outcome_col = "y", - estimator = "diff_in_means") +raw <- mrm_causal_design(df, + treatment_col = "D", + outcome_col = "y", + estimator = "diff_in_means" +) c(ipw = ipw$estimate, raw = raw$estimate) } diff --git a/r-package/morie/man/mrm_check_balancing.Rd b/r-package/morie/man/mrm_check_balancing.Rd index 5b7385df96..7a2f0d154f 100644 --- a/r-package/morie/man/mrm_check_balancing.Rd +++ b/r-package/morie/man/mrm_check_balancing.Rd @@ -30,9 +30,11 @@ df <- data.frame( age = rnorm(n, 50, 10), bmi = rnorm(n, 27, 4) ) -df$age[df$D == 1] <- df$age[df$D == 1] + 3 # imbalance on age -bal <- mrm_check_balancing(df, treatment_col = "D", - covariates = c("age", "bmi")) +df$age[df$D == 1] <- df$age[df$D == 1] + 3 # imbalance on age +bal <- mrm_check_balancing(df, + treatment_col = "D", + covariates = c("age", "bmi") +) bal$overall_balanced bal$interpretation } diff --git a/r-package/morie/man/mrm_check_overlap.Rd b/r-package/morie/man/mrm_check_overlap.Rd index 2404f2a306..28c84bea6e 100644 --- a/r-package/morie/man/mrm_check_overlap.Rd +++ b/r-package/morie/man/mrm_check_overlap.Rd @@ -27,8 +27,10 @@ n <- 300L x <- rnorm(n) D <- rbinom(n, 1, plogis(0.5 * x)) df <- data.frame(D = D, age = x) -ovl <- mrm_check_overlap(df, treatment_col = "D", - covariates = "age") +ovl <- mrm_check_overlap(df, + treatment_col = "D", + covariates = "age" +) ovl$positivity_violations ovl$interpretation } diff --git a/r-package/morie/man/mrm_classify_mandela.Rd b/r-package/morie/man/mrm_classify_mandela.Rd index fec90d9998..3f3715dfd1 100644 --- a/r-package/morie/man/mrm_classify_mandela.Rd +++ b/r-package/morie/man/mrm_classify_mandela.Rd @@ -97,7 +97,8 @@ Rules). A/RES/70/175. Sprott, J. B., & Doob, A. N. (2021). Solitary Confinement, Torture, and Canada's Structured Intervention Units. Centre for Criminology and Sociolegal Studies, University of Toronto. -\url{https://www.crimsl.utoronto.ca/sites/www.crimsl.utoronto.ca/files/TortureSolitarySIUsSprottDoob23Feb2021_0.pdf} +Available at the Centre for Criminology and Sociolegal Studies +web site: crimsl.utoronto.ca (file TortureSolitarySIUsSprottDoob23Feb2021_0.pdf). Iftene, A., & Doob, A. N. (2024). Do Independent External Decision Makers Ensure that "An Inmate's Confinement in a Structured diff --git a/r-package/morie/man/mrm_clt_demo.Rd b/r-package/morie/man/mrm_clt_demo.Rd index 64c8663d0d..9dcd469541 100644 --- a/r-package/morie/man/mrm_clt_demo.Rd +++ b/r-package/morie/man/mrm_clt_demo.Rd @@ -33,10 +33,12 @@ Generate sample means from a base distribution. \examples{ # 1000 sample means of size 30 from an exponential(1) base; # standardised z-scores converge to N(0,1): -res <- mrm_clt_demo(base_distribution = "exp", - n_samples = 1000L, - sample_size = 30L, - seed = 42L, rate = 1) +res <- mrm_clt_demo( + base_distribution = "exp", + n_samples = 1000L, + sample_size = 30L, + seed = 42L, rate = 1 +) summary(res$z_score) # mean ~ 0, sd ~ 1 } diff --git a/r-package/morie/man/mrm_design.Rd b/r-package/morie/man/mrm_design.Rd index 2a288a600a..e9caa01506 100644 --- a/r-package/morie/man/mrm_design.Rd +++ b/r-package/morie/man/mrm_design.Rd @@ -3,6 +3,10 @@ \name{mrm_design} \alias{mrm_design} \title{Experimental-design callables (designexptr-inspired)} +\value{ +Each design callable returns a named \code{list} of estimates, +test statistics, p-values, and a plain-language \code{interpretation}. +} \description{ R parity of \code{morie.mrm_design} (Python). Four general- purpose statistical-design entry points covering the @@ -11,6 +15,12 @@ one-way ANOVA with Tukey HSD, 2^k factorial design, and a designed-experiment convenience wrapper around the morie causal estimator family. } +\examples{ +set.seed(2026) +a <- rnorm(40, mean = 5, sd = 1.2) +b <- rnorm(40, mean = 5.5, sd = 1.5) +mrm_two_treatment_test(a, b)$p_welch +} \references{ Box, G. E. P., Hunter, J. S., & Hunter, W. G. (2005). Statistics for Experimenters. Wiley. diff --git a/r-package/morie/man/mrm_diagnostics.Rd b/r-package/morie/man/mrm_diagnostics.Rd index 41bad8e3a7..722dc042d5 100644 --- a/r-package/morie/man/mrm_diagnostics.Rd +++ b/r-package/morie/man/mrm_diagnostics.Rd @@ -3,10 +3,27 @@ \name{mrm_diagnostics} \alias{mrm_diagnostics} \title{Causal-inference diagnostics (R parity)} +\value{ +Each diagnostic callable returns a named \code{list} of balance +and overlap statistics (or the estimated effect) together with a +plain-language \code{interpretation}. +} \description{ Balance, overlap, SUTVA-style assumption checks, and the median causal effect estimator. R parity of \code{morie.mrm_diagnostics}. } +\examples{ +set.seed(2026) +n <- 200L +df <- data.frame( + D = rbinom(n, 1, 0.4), + age = rnorm(n, 50, 10), bmi = rnorm(n, 27, 4) +) +mrm_standardised_difference(df, + treatment_col = "D", + covariates = c("age", "bmi") +) +} \references{ Imbens, G. W., & Rubin, D. B. (2015). Causal Inference for Statistics, Social and Biomedical Sciences. Cambridge University Press. diff --git a/r-package/morie/man/mrm_doe.Rd b/r-package/morie/man/mrm_doe.Rd index 2e3a2646a3..fe7b8e3e31 100644 --- a/r-package/morie/man/mrm_doe.Rd +++ b/r-package/morie/man/mrm_doe.Rd @@ -3,10 +3,24 @@ \name{mrm_doe} \alias{mrm_doe} \title{Design-of-Experiments toolkit (R parity)} +\value{ +Each design-of-experiments callable returns a named \code{list} +holding the constructed design or the analysis result and a +plain-language \code{interpretation}. +} \description{ R parity of \code{morie.mrm_doe}. Closes the Chapter-3/4/5 coverage gap from designexptr.org. } +\examples{ +set.seed(2026) +n <- 30L +df <- data.frame( + y = c(rnorm(n, 0), rnorm(n, 0.5), rnorm(n, 1)), + g = rep(c("A", "B", "C"), each = n) +) +mrm_anova_bonferroni(df, response_col = "y", group_col = "g")$alpha_per_pair +} \references{ Box, G. E. P., Hunter, J. S., & Hunter, W. G. (2005). Statistics for Experimenters (2nd ed.). Wiley. diff --git a/r-package/morie/man/mrm_factorial_2k.Rd b/r-package/morie/man/mrm_factorial_2k.Rd index f4554213b9..a039aefb68 100644 --- a/r-package/morie/man/mrm_factorial_2k.Rd +++ b/r-package/morie/man/mrm_factorial_2k.Rd @@ -31,8 +31,10 @@ set.seed(2026) lvl <- c(-1, 1) df <- expand.grid(A = lvl, B = lvl, C = lvl) df$y <- 10 + 2 * df$A + 1.5 * df$B + 0.5 * df$A * df$B + rnorm(8, 0, 0.2) -res <- mrm_factorial_2k(df, response_col = "y", - factor_cols = c("A", "B", "C")) +res <- mrm_factorial_2k(df, + response_col = "y", + factor_cols = c("A", "B", "C") +) res$main_effects res$interaction_effects } diff --git a/r-package/morie/man/mrm_fractional_factorial.Rd b/r-package/morie/man/mrm_fractional_factorial.Rd index 49ee8e1910..fe41667e0d 100644 --- a/r-package/morie/man/mrm_fractional_factorial.Rd +++ b/r-package/morie/man/mrm_fractional_factorial.Rd @@ -32,7 +32,9 @@ df <- data.frame( C = c(1, -1, -1, 1) ) df$y <- 5 + 2 * df$A + 1.5 * df$B + rnorm(4, 0, 0.3) -res <- mrm_fractional_factorial(df, response_col = "y", - factor_cols = c("A", "B", "C")) +res <- mrm_fractional_factorial(df, + response_col = "y", + factor_cols = c("A", "B", "C") +) res$main_effects } diff --git a/r-package/morie/man/mrm_graeco_latin.Rd b/r-package/morie/man/mrm_graeco_latin.Rd index 1c3b7886cb..ff6df68ba1 100644 --- a/r-package/morie/man/mrm_graeco_latin.Rd +++ b/r-package/morie/man/mrm_graeco_latin.Rd @@ -20,22 +20,28 @@ Graeco-Latin square four-way ANOVA (row, col, Latin, Greek) \examples{ # Hardcoded 4 x 4 orthogonal Graeco-Latin square (two random Latin # squares are generally NOT orthogonal, so we use a known pair): -L <- matrix(c("A","B","C","D", - "B","A","D","C", - "C","D","A","B", - "D","C","B","A"), nrow = 4L, byrow = TRUE) -G <- matrix(c("a","b","c","d", - "c","d","a","b", - "d","c","b","a", - "b","a","d","c"), nrow = 4L, byrow = TRUE) +L <- matrix(c( + "A", "B", "C", "D", + "B", "A", "D", "C", + "C", "D", "A", "B", + "D", "C", "B", "A" +), nrow = 4L, byrow = TRUE) +G <- matrix(c( + "a", "b", "c", "d", + "c", "d", "a", "b", + "d", "c", "b", "a", + "b", "a", "d", "c" +), nrow = 4L, byrow = TRUE) set.seed(2026) df <- expand.grid(row = paste0("R", 1:4), col = paste0("C", 1:4)) df$latin <- as.vector(L) df$greek <- as.vector(G) df$y <- match(df$latin, LETTERS) * 1.2 + match(df$greek, letters) * 0.5 + rnorm(16, 0, 0.3) -res <- mrm_graeco_latin(df, response_col = "y", - row_col = "row", col_col = "col", - latin_col = "latin", greek_col = "greek") +res <- mrm_graeco_latin(df, + response_col = "y", + row_col = "row", col_col = "col", + latin_col = "latin", greek_col = "greek" +) res$anova } diff --git a/r-package/morie/man/mrm_kulldorff.Rd b/r-package/morie/man/mrm_kulldorff.Rd index baffaad367..8265d455a3 100644 --- a/r-package/morie/man/mrm_kulldorff.Rd +++ b/r-package/morie/man/mrm_kulldorff.Rd @@ -3,6 +3,12 @@ \name{mrm_kulldorff} \alias{mrm_kulldorff} \title{Kulldorff space-time scan statistic on TPS event data} +\value{ +\code{mrm_tps_kulldorff_scan()} returns a named \code{list} with +the most likely cluster, its Poisson log-likelihood-ratio statistic, +the Monte-Carlo permutation p-value, and a plain-language +\code{interpretation}. +} \description{ R parity of \code{morie.mrm_kulldorff.mrm_tps_kulldorff_scan()}. Implements Kulldorff's 1997 Poisson log-likelihood-ratio space-time @@ -14,6 +20,12 @@ computing the Poisson LRT against \eqn{H_0} (events uniformly distributed in space and time). The maximum LRT is the test statistic; permutations of event timestamps generate the null. } +\examples{ +if (FALSE) { + tps <- morie_sample("tps_assault") + mrm_tps_kulldorff_scan(tps, n_permutations = 49) +} +} \references{ Kulldorff, M. (1997). A spatial scan statistic. Communications in Statistics: Theory and Methods, 26(6), 1481--1496. diff --git a/r-package/morie/man/mrm_latin_square.Rd b/r-package/morie/man/mrm_latin_square.Rd index afa0b69211..ee3b616e7e 100644 --- a/r-package/morie/man/mrm_latin_square.Rd +++ b/r-package/morie/man/mrm_latin_square.Rd @@ -26,8 +26,10 @@ df <- expand.grid(row = paste0("R", 1:4), col = paste0("C", 1:4)) df$treatment <- LETTERS[as.integer(as.vector(sq)) + 1L] set.seed(2026) df$y <- match(df$treatment, LETTERS) * 1.5 + rnorm(16, 0, 0.4) -res <- mrm_latin_square(df, response_col = "y", - row_col = "row", col_col = "col", - treatment_col = "treatment") +res <- mrm_latin_square(df, + response_col = "y", + row_col = "row", col_col = "col", + treatment_col = "treatment" +) res$anova } diff --git a/r-package/morie/man/mrm_lisa.Rd b/r-package/morie/man/mrm_lisa.Rd index bc08015d0a..2604342d28 100644 --- a/r-package/morie/man/mrm_lisa.Rd +++ b/r-package/morie/man/mrm_lisa.Rd @@ -4,6 +4,11 @@ \alias{mrm_lisa} \title{LISA (Local Indicators of Spatial Association) on polygon-level crime data + per-year polygon Moran's I time series} +\value{ +The LISA callables return named \code{list}s with per-polygon +local Moran's I, permutation p-values, cluster classifications, and +(for the per-year wrapper) the time series of global Moran's I. +} \description{ R parity of \code{morie.mrm_tps_lisa()} and \code{morie.mrm_tps_polygon_moran_per_year()}. Local Moran's I per @@ -11,6 +16,15 @@ polygon centroid with 999-permutation MC significance, plus a convenience wrapper for the per-year time series used by the morie empirical paper Section 7.11. } +\examples{ +if (FALSE) { + ncr <- read.csv("Neighbourhood_Crime_Rates_Open_Data.csv") + mrm_tps_lisa(ncr, + count_col = "ASSAULT_2024", + lat_col = "lat", lon_col = "lon" + ) +} +} \references{ Anselin, L. (1995). Local indicators of spatial association -- LISA. \emph{Geographical Analysis}, 27(2), 93--115. diff --git a/r-package/morie/man/mrm_mathstats.Rd b/r-package/morie/man/mrm_mathstats.Rd index eef45aa1ec..08cbc65e14 100644 --- a/r-package/morie/man/mrm_mathstats.Rd +++ b/r-package/morie/man/mrm_mathstats.Rd @@ -3,11 +3,18 @@ \name{mrm_mathstats} \alias{mrm_mathstats} \title{Mathematical-statistics / simulation / computation toolkit (R parity)} +\value{ +Each callable returns a named \code{list} with the computed +statistic(s) and a plain-language \code{interpretation}. +} \description{ R parity of \code{morie.mrm_mathstats}. Closes the Chapter-2 coverage gap from designexptr.org/mathematical-statistics- simulation-and-computation.html. } +\examples{ +mrm_oneprop_test(x = 58, n = 100, p0 = 0.5) +} \references{ Wilks, S. S. (1962). Mathematical Statistics. Wiley. Casella, G. & Berger, R. L. (2002). Statistical Inference. Duxbury. diff --git a/r-package/morie/man/mrm_mc_power.Rd b/r-package/morie/man/mrm_mc_power.Rd index f93002619a..26fd6af509 100644 --- a/r-package/morie/man/mrm_mc_power.Rd +++ b/r-package/morie/man/mrm_mc_power.Rd @@ -31,5 +31,6 @@ my_sim <- function(seed) { } res <- mrm_mc_power(my_sim, n_sims = 500L, alpha = 0.05) res$empirical_power -res$ci95_lower; res$ci95_upper +res$ci95_lower +res$ci95_upper } diff --git a/r-package/morie/man/mrm_median_causal_effect.Rd b/r-package/morie/man/mrm_median_causal_effect.Rd index 4dfdca874b..bcd6a944fc 100644 --- a/r-package/morie/man/mrm_median_causal_effect.Rd +++ b/r-package/morie/man/mrm_median_causal_effect.Rd @@ -29,9 +29,11 @@ x <- rnorm(n) D <- rbinom(n, 1, plogis(0.5 * x)) y <- 0.7 * D + 0.3 * x + rnorm(n, 0, 0.5) df <- data.frame(D = D, y = y, age = x) -res <- mrm_median_causal_effect(df, treatment_col = "D", - outcome_col = "y", - covariates = "age") +res <- mrm_median_causal_effect(df, + treatment_col = "D", + outcome_col = "y", + covariates = "age" +) res$median_treatment_effect res$n_matched } diff --git a/r-package/morie/man/mrm_otis.Rd b/r-package/morie/man/mrm_otis.Rd index 12c3ea75af..293823ed0b 100644 --- a/r-package/morie/man/mrm_otis.Rd +++ b/r-package/morie/man/mrm_otis.Rd @@ -3,6 +3,11 @@ \name{mrm_otis} \alias{mrm_otis} \title{MRM-framework analyses on Ontario OTIS data} +\value{ +Each \code{mrm_otis_*()} callable returns a named \code{list} with +the computed statistics (concentration indices, survival curves, or +association measures) and a plain-language \code{interpretation}. +} \description{ Five callables for the OTIS (Offender Tracking Information System) public-release datasets, used in the MRM (Multilevel Reconciliation @@ -31,3 +36,9 @@ The OTIS \code{UniqueIndividual_ID} column has format \code{YYYY-XXXXX-SG} and is randomly reassigned every fiscal year. Cross-year tracking is therefore invalid; all analyses below operate within fiscal year. } +\examples{ +if (FALSE) { + b09 <- read.csv("b09_individuals_in_segregation.csv") + mrm_otis_placement_concentration(b09) +} +} diff --git a/r-package/morie/man/mrm_otis_mandela_spectrum.Rd b/r-package/morie/man/mrm_otis_mandela_spectrum.Rd index 6ca8afe5d2..ae870ef2a6 100644 --- a/r-package/morie/man/mrm_otis_mandela_spectrum.Rd +++ b/r-package/morie/man/mrm_otis_mandela_spectrum.Rd @@ -80,5 +80,6 @@ if (FALSE) { \references{ United Nations General Assembly (2015). United Nations Standard Minimum Rules for the Treatment of Prisoners (the Nelson Mandela -Rules). A/RES/70/175. Rule 43 = prolonged (>15 days). Rule 44 ==22 hours/day, no meaningful human contact. +Rules). A/RES/70/175. Rule 43 = prolonged (more than 15 days). +Rule 44 = at least 22 hours/day, no meaningful human contact. } diff --git a/r-package/morie/man/mrm_otis_mortification_cooccurrence.Rd b/r-package/morie/man/mrm_otis_mortification_cooccurrence.Rd index d1ee7b2484..568d7f8069 100644 --- a/r-package/morie/man/mrm_otis_mortification_cooccurrence.Rd +++ b/r-package/morie/man/mrm_otis_mortification_cooccurrence.Rd @@ -17,7 +17,7 @@ mrm_otis_mortification_cooccurrence( } \value{ A data.frame with one row per pair, columns \code{alert_a}, -\code{alert_b}, \code{n}, \code{chi2}, \code{df}, \code{p_value}, \code{cramers_v}. +\code{alert_b}, \code{n}, \code{chi2}, \code{df}, \code{p_value}, \code{morie_cramers_v}. } \description{ Computes the pairwise Cramer's V (and chi-square test) for every diff --git a/r-package/morie/man/mrm_otis_region_locality.Rd b/r-package/morie/man/mrm_otis_region_locality.Rd index 95dffc4744..4cc1cf7976 100644 --- a/r-package/morie/man/mrm_otis_region_locality.Rd +++ b/r-package/morie/man/mrm_otis_region_locality.Rd @@ -21,7 +21,7 @@ mrm_otis_region_locality( } \value{ A list with named elements \code{table} (the contingency matrix), -\code{chi2}, \code{df}, \code{p_value}, \code{cramers_v}, \code{diagonal_share}, +\code{chi2}, \code{df}, \code{p_value}, \code{morie_cramers_v}, \code{diagonal_share}, \code{off_diagonal_share}. } \description{ diff --git a/r-package/morie/man/mrm_perm_block.Rd b/r-package/morie/man/mrm_perm_block.Rd index ad6e71b9a0..cf954a6856 100644 --- a/r-package/morie/man/mrm_perm_block.Rd +++ b/r-package/morie/man/mrm_perm_block.Rd @@ -31,15 +31,19 @@ Permutes treatment labels within each block. } \examples{ set.seed(2026) -df <- expand.grid(block = paste0("B", 1:6), - treatment = c("ctrl", "drug")) +df <- expand.grid( + block = paste0("B", 1:6), + treatment = c("ctrl", "drug") +) # Block-level baseline + treatment effect df$y <- as.numeric(df$block) * 1.2 + ifelse(df$treatment == "drug", 0.7, 0) + rnorm(nrow(df), 0, 0.4) -res <- mrm_perm_block(df, response_col = "y", - treatment_col = "treatment", - block_col = "block", - n_perm = 500L) +res <- mrm_perm_block(df, + response_col = "y", + treatment_col = "treatment", + block_col = "block", + n_perm = 500L +) res$p_value } diff --git a/r-package/morie/man/mrm_pit.Rd b/r-package/morie/man/mrm_pit.Rd index 348ab7f8ba..327f95ee44 100644 --- a/r-package/morie/man/mrm_pit.Rd +++ b/r-package/morie/man/mrm_pit.Rd @@ -27,8 +27,8 @@ set.seed(2026) x <- rnorm(200) # Under correct distributional assumption, U should be ~Uniform(0,1): pit <- mrm_pit(x, dist = "norm") -attr(pit, "ks_pvalue") # large p-value => no evidence against fit +attr(pit, "ks_pvalue") # large p-value => no evidence against fit # If we deliberately misspecify (claim t_3 fits the normal sample): pit_wrong <- mrm_pit(x, dist = "t", df = 3) -attr(pit_wrong, "ks_pvalue") # small p-value => misspecification detected +attr(pit_wrong, "ks_pvalue") # small p-value => misspecification detected } diff --git a/r-package/morie/man/mrm_random_latin.Rd b/r-package/morie/man/mrm_random_latin.Rd index 0c256ef692..d6e76e4fd7 100644 --- a/r-package/morie/man/mrm_random_latin.Rd +++ b/r-package/morie/man/mrm_random_latin.Rd @@ -26,6 +26,8 @@ and symbols. Uniform over a subset of Latin squares (not all). mrm_random_latin(k = 4, seed = 42L) # Reproducible across runs with the same seed: -identical(mrm_random_latin(5, seed = 7), - mrm_random_latin(5, seed = 7)) +identical( + mrm_random_latin(5, seed = 7), + mrm_random_latin(5, seed = 7) +) } diff --git a/r-package/morie/man/mrm_rcbd.Rd b/r-package/morie/man/mrm_rcbd.Rd index 013fcbf834..509e7a6e6f 100644 --- a/r-package/morie/man/mrm_rcbd.Rd +++ b/r-package/morie/man/mrm_rcbd.Rd @@ -21,12 +21,16 @@ Returns Type-I ANOVA: block enters first, then treatment. } \examples{ set.seed(2026) -df <- expand.grid(treatment = c("A", "B", "C"), - block = c("B1", "B2", "B3", "B4")) +df <- expand.grid( + treatment = c("A", "B", "C"), + block = c("B1", "B2", "B3", "B4") +) # Treatment effect + block effect + noise df$y <- as.numeric(df$treatment) * 2 + as.numeric(df$block) * 0.5 + rnorm(nrow(df), 0, 0.3) -res <- mrm_rcbd(df, response_col = "y", - treatment_col = "treatment", block_col = "block") +res <- mrm_rcbd(df, + response_col = "y", + treatment_col = "treatment", block_col = "block" +) res$anova } diff --git a/r-package/morie/man/mrm_response_surface.Rd b/r-package/morie/man/mrm_response_surface.Rd index 43dafe18c2..de892f4106 100644 --- a/r-package/morie/man/mrm_response_surface.Rd +++ b/r-package/morie/man/mrm_response_surface.Rd @@ -24,12 +24,16 @@ returns the stationary point if the quadratic matrix B is invertible. \examples{ # Central composite design on (x1, x2) with quadratic response. set.seed(2026) -df <- expand.grid(x1 = c(-1.4, -1, 0, 1, 1.4), - x2 = c(-1.4, -1, 0, 1, 1.4)) +df <- expand.grid( + x1 = c(-1.4, -1, 0, 1, 1.4), + x2 = c(-1.4, -1, 0, 1, 1.4) +) df$y <- 10 + 2 * df$x1 + 1.5 * df$x2 - df$x1^2 - 1.2 * df$x2^2 + rnorm(nrow(df), 0, 0.2) -res <- mrm_response_surface(df, response_col = "y", - factor_cols = c("x1", "x2")) +res <- mrm_response_surface(df, + response_col = "y", + factor_cols = c("x1", "x2") +) res$stationary_point res$stationary_nature } diff --git a/r-package/morie/man/mrm_samples.Rd b/r-package/morie/man/mrm_samples.Rd index 381478f75d..25855e6235 100644 --- a/r-package/morie/man/mrm_samples.Rd +++ b/r-package/morie/man/mrm_samples.Rd @@ -3,6 +3,12 @@ \name{mrm_samples} \alias{mrm_samples} \title{Bundled reference data samples and dataset fetchers} +\value{ +The on-demand fetchers (\code{morie_fetch_tps()}, +\code{morie_fetch_siu()}) return the file path to the downloaded or +cached CSV; \code{morie_load_dataset()} returns the loaded +\code{data.frame}. +} \description{ MORIE ships a small set of reference CSVs in \verb{inst/extdata/} so that the \verb{mrm_otis_*()} and \verb{mrm_tps_*()} callables can be exercised @@ -18,8 +24,14 @@ CKAN fetcher. \item TPS: Toronto Police Open Data ArcGIS REST. Use \code{morie_fetch_tps(category = "Assault")}. \item SIU: Ontario SIU Director's Reports site. Use -\code{morie_fetch_siu()} which scrapes the public reports site on -demand (per-user, since redistribution of the scraped corpus is +\code{morie_fetch_siu()} which parses the public reports site on +demand (per-user, since redistribution of the parsed corpus is not clearly licensed). } } +\examples{ +if (FALSE) { + b01 <- morie_load_dataset("otisb01") + head(b01) +} +} diff --git a/r-package/morie/man/mrm_siu.Rd b/r-package/morie/man/mrm_siu.Rd index e9fbc2f778..a1bb04323a 100644 --- a/r-package/morie/man/mrm_siu.Rd +++ b/r-package/morie/man/mrm_siu.Rd @@ -3,6 +3,11 @@ \name{mrm_siu} \alias{mrm_siu} \title{MRM-framework analyses on Ontario SIU (Special Investigations Unit) data} +\value{ +Each \code{mrm_siu_*()} callable returns a named \code{list} with +the survival, per-service rate, or outcome-classification result and a +plain-language \code{interpretation}. +} \description{ Three callables for SIU case-level CSVs. Unlike OTIS (no placement dates) and TPS (no per-person ID), SIU exposes per-case dates with a @@ -23,3 +28,9 @@ categories (\code{charges_laid}, \code{no_charges}, etc.) by service and by year, reporting both raw counts and shares. } } +\examples{ +if (FALSE) { + siu <- read.csv("SIU.csv") + mrm_siu_case_to_decision_km(siu) +} +} diff --git a/r-package/morie/man/mrm_standardised_difference.Rd b/r-package/morie/man/mrm_standardised_difference.Rd index 779a00854c..0b1064e304 100644 --- a/r-package/morie/man/mrm_standardised_difference.Rd +++ b/r-package/morie/man/mrm_standardised_difference.Rd @@ -30,7 +30,9 @@ df <- data.frame( age = rnorm(n, 50, 10), bmi = rnorm(n, 27, 4) ) -df$age[df$D == 1] <- df$age[df$D == 1] + 3 # deliberate imbalance -mrm_standardised_difference(df, treatment_col = "D", - covariates = c("age", "bmi")) +df$age[df$D == 1] <- df$age[df$D == 1] + 3 # deliberate imbalance +mrm_standardised_difference(df, + treatment_col = "D", + covariates = c("age", "bmi") +) } diff --git a/r-package/morie/man/mrm_tps.Rd b/r-package/morie/man/mrm_tps.Rd index 165454c2a0..fa8c67cdef 100644 --- a/r-package/morie/man/mrm_tps.Rd +++ b/r-package/morie/man/mrm_tps.Rd @@ -3,6 +3,12 @@ \name{mrm_tps} \alias{mrm_tps} \title{MRM-framework analyses on Toronto Police Service (TPS) open data} +\value{ +Each \code{mrm_tps_*()} callable returns a named \code{list} with +the computed statistic (Pareto exponent, Moran's I, or survival curve) +and a plain-language \code{interpretation}; \code{mrm_tps_load_hawkes_refit()} +returns the parsed Hawkes-refit manifest as a \code{list}. +} \description{ Four callables for TPS public-release crime-incident CSVs, used in the MRM empirical companion paper. @@ -21,3 +27,9 @@ precomputed per-category Hawkes (Markovian + Weibull/sin) fits from the \code{paper_hawkes_refit.json} manifest if available. } } +\examples{ +if (FALSE) { + tps <- read.csv("Assault_Open_Data.csv") + mrm_tps_levy_scaling(tps) +} +} diff --git a/r-package/morie/man/mrm_tps_lisa.Rd b/r-package/morie/man/mrm_tps_lisa.Rd index 14f5de6c0b..295e6a6b0e 100644 --- a/r-package/morie/man/mrm_tps_lisa.Rd +++ b/r-package/morie/man/mrm_tps_lisa.Rd @@ -46,7 +46,9 @@ significance \examples{ if (FALSE) { ncr <- read.csv("Neighbourhood_Crime_Rates_Open_Data.csv") - res <- mrm_tps_lisa(ncr, count_col = "ASSAULT_2024", - lat_col = "lat", lon_col = "lon") + res <- mrm_tps_lisa(ncr, + count_col = "ASSAULT_2024", + lat_col = "lat", lon_col = "lon" + ) } } diff --git a/r-package/morie/man/mrm_tps_polygon_moran_per_year.Rd b/r-package/morie/man/mrm_tps_polygon_moran_per_year.Rd index c529b987d4..ffcc99c9ac 100644 --- a/r-package/morie/man/mrm_tps_polygon_moran_per_year.Rd +++ b/r-package/morie/man/mrm_tps_polygon_moran_per_year.Rd @@ -34,8 +34,10 @@ of per-year count columns. \examples{ # 4 x 4 polygon grid with two yearly count columns. set.seed(2026) -grid <- expand.grid(lat = 43.6 + (0:3) * 0.02, - lon = -79.4 + (0:3) * 0.02) +grid <- expand.grid( + lat = 43.6 + (0:3) * 0.02, + lon = -79.4 + (0:3) * 0.02 +) grid$ASSAULT_2023 <- rpois(nrow(grid), lambda = grid$lat * 10) grid$ASSAULT_2024 <- rpois(nrow(grid), lambda = grid$lat * 12) res <- mrm_tps_polygon_moran_per_year( diff --git a/r-package/morie/man/mrm_two_treatment_test.Rd b/r-package/morie/man/mrm_two_treatment_test.Rd index e73d18282a..c1e176dc3f 100644 --- a/r-package/morie/man/mrm_two_treatment_test.Rd +++ b/r-package/morie/man/mrm_two_treatment_test.Rd @@ -24,10 +24,10 @@ range. } \examples{ set.seed(2026) -a <- rnorm(40, mean = 5, sd = 1.2) +a <- rnorm(40, mean = 5, sd = 1.2) b <- rnorm(40, mean = 5.5, sd = 1.5) res <- mrm_two_treatment_test(a, b) -res$estimate # mean(a) - mean(b) -res$p_welch # canonical p-value +res$estimate # mean(a) - mean(b) +res$p_welch # canonical p-value res$p_mannwhitney # rank-based sensitivity check } diff --git a/r-package/morie/man/nstat.Rd b/r-package/morie/man/nstat.Rd index db70816e68..a42e42d577 100644 --- a/r-package/morie/man/nstat.Rd +++ b/r-package/morie/man/nstat.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/nstat.R \name{nstat} \alias{nstat} -\alias{nonstationary_covariance} +\alias{morie_nonstationary_covariance} \title{Non-stationary covariance estimation (moving-window kernel).} \usage{ nstat(x, coords, bandwidth = NULL) -nonstationary_covariance(x, coords, bandwidth = NULL) +morie_nonstationary_covariance(x, coords, bandwidth = NULL) } \arguments{ \item{x}{Numeric vector.} @@ -25,10 +25,7 @@ C(s_i, s_j) = sigma(s_i) sigma(s_j) rho(s_i, s_j) with kernel-weighted local moments and standardised residual correlations. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +nstat(x = rnorm(50), coords = matrix(runif(100), 50, 2)) } \references{ Sampson & Guttorp (1992); Schabenberger & Gotway (2005), Ch 8. diff --git a/r-package/morie/man/odds_ratio_ci.Rd b/r-package/morie/man/odds_ratio_ci.Rd deleted file mode 100644 index 4c590a611e..0000000000 --- a/r-package/morie/man/odds_ratio_ci.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{odds_ratio_ci} -\alias{odds_ratio_ci} -\title{Odds ratio and 95\% CI from a 2x2 contingency table} -\description{ -Odds ratio and 95\% CI from a 2x2 contingency table. -} -\usage{ - odds_ratio_ci(table_2x2, alpha) -} -\arguments{ - \item{table_2x2}{A 2x2 matrix: rows are treatment, columns are outcome.} - \item{alpha}{Significance level.} -} -\value{ - Named list: 'or', 'ci_lower', 'ci_upper', 'p_value'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/okrig.Rd b/r-package/morie/man/okrig.Rd index 9160ad3964..dc4ecb9cd1 100644 --- a/r-package/morie/man/okrig.Rd +++ b/r-package/morie/man/okrig.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/okrig.R \name{okrig} \alias{okrig} -\alias{ordinary_kriging} +\alias{morie_ordinary_kriging} \title{Ordinary kriging prediction (exponential / gaussian / spherical).} \usage{ okrig( @@ -15,7 +15,7 @@ okrig( range_ = 1 ) -ordinary_kriging( +morie_ordinary_kriging( x, coords, target, @@ -46,10 +46,7 @@ Named list: estimate, se, n, method. \begin{bmatrix}c_0 \\ 1\end{bmatrix}} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +okrig(x = rnorm(50), coords = matrix(runif(100), 50, 2), target = rnorm(50)) } \references{ Schabenberger & Gotway (2005), Ch 4. diff --git a/r-package/morie/man/omega_squared.Rd b/r-package/morie/man/omega_squared.Rd deleted file mode 100644 index 93965688ee..0000000000 --- a/r-package/morie/man/omega_squared.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{omega_squared} -\alias{omega_squared} -\title{Omega-squared (less biased than eta-squared)} -\description{ -Compute omega-squared as a less-biased effect-size estimator than -eta-squared for one-way ANOVA designs. -} -\usage{ -omega_squared(f_stat, df_between, df_within, n) -} -\arguments{ -\item{f_stat}{The F statistic from the one-way ANOVA.} -\item{df_between}{Between-groups degrees of freedom.} -\item{df_within}{Within-groups (residual) degrees of freedom.} -\item{n}{Total sample size.} -} -\value{ -Numeric omega-squared. -} -\examples{ -omega_squared(f_stat = 5.2, df_between = 2, df_within = 87, n = 90) -} diff --git a/r-package/morie/man/one_sample_t_test.Rd b/r-package/morie/man/one_sample_t_test.Rd deleted file mode 100644 index 0ce3a85c4b..0000000000 --- a/r-package/morie/man/one_sample_t_test.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by morie generate_rd.py -\name{one_sample_t_test} -\alias{one_sample_t_test} -\title{One-sample t-test} -\description{ - One-sample t-test -} -\usage{ - one_sample_t_test(x, mu0, alternative) -} -\arguments{ - \item{x}{Numeric vector.} - \item{mu0}{Null hypothesis mean (default 0).} - \item{alternative}{'"two.sided"', '"greater"', or '"less"'.} -} -\value{ - Named list: 't', 'df', 'p_value', 'ci'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/optcl.Rd b/r-package/morie/man/optcl.Rd index 7454c46028..34b9e6d2a9 100644 --- a/r-package/morie/man/optcl.Rd +++ b/r-package/morie/man/optcl.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/optcl.R \name{optcl} \alias{optcl} -\alias{optimal_classification} +\alias{morie_optimal_classification} \title{Optimal Classification cutting point (Poole 2000; Armstrong Ch 3)} \usage{ optcl(x, votes = NULL) -optimal_classification(x, votes = NULL) +morie_optimal_classification(x, votes = NULL) } \arguments{ \item{x}{Numeric vector of ideal points.} @@ -24,9 +24,6 @@ of a binary vote vector. Reports PRE (proportional reduction in error) against the modal-class baseline. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +optcl(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/ordered_alternatives_test.Rd b/r-package/morie/man/ordered_alternatives_test.Rd deleted file mode 100644 index e9685af085..0000000000 --- a/r-package/morie/man/ordered_alternatives_test.Rd +++ /dev/null @@ -1,52 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ordlt.R, R/ordlt_jonckheere.R -\name{ordered_alternatives_test} -\alias{ordered_alternatives_test} -\title{Jonckheere-Terpstra ordered-alternatives test (Gibbons & Chakraborti Ch 10.6)} -\usage{ -ordered_alternatives_test(groups) - -ordered_alternatives_test(groups) -} -\arguments{ -\item{groups}{List of numeric vectors in monotone hypothesised order.} -} -\value{ -Named list with \code{statistic} (z), \code{p_value}, \code{J}, -\code{E_J}, \code{Var_J}, \code{n}, \code{method}. - -Named list: statistic, p_value, z, E_J, Var_J, n, k, method. -} -\description{ -R parity for \code{morie.fn.ordlt.ordered_alternatives_test}. The -Python module also exports a proportional-odds \code{ordered_logit} -estimator (kept as a separate R callable in a future release). - -Tests H0: F_1 = ... = F_k against the ordered alternative -H1: F_1 <= F_2 <= ... <= F_k. J = sum over i 0.5 +set.seed(0) +fs <- 100 +t <- seq(0, 10, length.out = 1024) +a <- sin(2 * pi * 10 * t) +b <- a + 0.1 * rnorm(length(t)) +rgcoh(a, b, fs = fs)$peak_coherence > 0.5 } } \references{ diff --git a/r-package/morie/man/rgcrl.Rd b/r-package/morie/man/rgcrl.Rd index 806c5950b4..02c054348a 100644 --- a/r-package/morie/man/rgcrl.Rd +++ b/r-package/morie/man/rgcrl.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgcrl.R \name{rgcrl} \alias{rgcrl} -\alias{rangayyan_correlation_dimension} +\alias{morie_rangayyan_correlation_dimension} \title{Correlation dimension (Grassberger-Procaccia) -- Rangayyan Ch 7} \usage{ rgcrl(x, m = 3L, tau = 1L, n_r = 20L) -rangayyan_correlation_dimension(x, m = 3L, tau = 1L, n_r = 20L) +morie_rangayyan_correlation_dimension(x, m = 3L, tau = 1L, n_r = 20L) } \arguments{ \item{x}{Numeric vector.} @@ -25,7 +25,8 @@ Named list \code{D2}, \code{log_r}, \code{log_C}, \code{m}, \code{tau}. Slope of \eqn{\log C(r)} vs \eqn{\log r} in the middle scaling region. } \examples{ -set.seed(0); rgcrl(rnorm(200), m = 3, tau = 1, n_r = 15)$D2 +set.seed(0) +rgcrl(rnorm(200), m = 3, tau = 1, n_r = 15)$D2 } \references{ Grassberger & Procaccia (1983), Physica D 9:189. diff --git a/r-package/morie/man/rgdfa.Rd b/r-package/morie/man/rgdfa.Rd index fcc93955e9..11ed3dbf71 100644 --- a/r-package/morie/man/rgdfa.Rd +++ b/r-package/morie/man/rgdfa.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgdfa.R \name{rgdfa} \alias{rgdfa} -\alias{rangayyan_dfa} +\alias{morie_rangayyan_dfa} \title{Detrended fluctuation analysis -- Rangayyan Ch 7} \usage{ rgdfa(x, scales = NULL, order = 1L) -rangayyan_dfa(x, scales = NULL, order = 1L) +morie_rangayyan_dfa(x, scales = NULL, order = 1L) } \arguments{ \item{x}{Numeric vector.} @@ -17,13 +17,14 @@ rangayyan_dfa(x, scales = NULL, order = 1L) \item{order}{Detrending polynomial order (default 1 = DFA-1).} } \value{ -Named list \code{alpha}, \code{scales}, \code{F}, \code{log_scales}, \code{log_F}. +Named list \code{alpha}, \code{scales}, \code{fluct}, \code{log_scales}, \code{log_F}. } \description{ DFA-alpha scaling exponent (Peng et al. 1994). } \examples{ -set.seed(0); rgdfa(rnorm(500))$alpha +set.seed(0) +rgdfa(rnorm(500))$alpha } \references{ Peng et al. (1994), Phys Rev E 49:1685; Rangayyan Ch 7. diff --git a/r-package/morie/man/rgeeg.Rd b/r-package/morie/man/rgeeg.Rd index f92e72ae95..d428647687 100644 --- a/r-package/morie/man/rgeeg.Rd +++ b/r-package/morie/man/rgeeg.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgeeg.R \name{rgeeg} \alias{rgeeg} -\alias{rangayyan_eeg_bands} +\alias{morie_rangayyan_eeg_bands} \title{EEG band power (delta theta alpha beta gamma) via Welch -- Rangayyan Ch 9} \usage{ rgeeg(x, fs, bands = NULL, nperseg = NULL) -rangayyan_eeg_bands(x, fs, bands = NULL, nperseg = NULL) +morie_rangayyan_eeg_bands(x, fs, bands = NULL, nperseg = NULL) } \arguments{ \item{x}{Numeric EEG vector.} @@ -32,10 +32,12 @@ gamma 30-100 Hz. } \examples{ \donttest{ - set.seed(0); fs <- 256 - t <- seq(0, 8, length.out = 2048) - x <- sin(2 * pi * 10 * t) + 0.3 * rnorm(length(t)) - r <- rgeeg(x, fs = fs); r$relative[["alpha"]] > r$relative[["gamma"]] +set.seed(0) +fs <- 256 +t <- seq(0, 8, length.out = 2048) +x <- sin(2 * pi * 10 * t) + 0.3 * rnorm(length(t)) +r <- rgeeg(x, fs = fs) +r$relative[["alpha"]] > r$relative[["gamma"]] } } \references{ diff --git a/r-package/morie/man/rgemg.Rd b/r-package/morie/man/rgemg.Rd index 523cee03c4..099c76f0b0 100644 --- a/r-package/morie/man/rgemg.Rd +++ b/r-package/morie/man/rgemg.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgemg.R \name{rgemg} \alias{rgemg} -\alias{rangayyan_emg_rms} +\alias{morie_rangayyan_emg_rms} \title{EMG sliding-window RMS envelope -- Rangayyan Ch 8} \usage{ rgemg(x, window = 64L, fs = 1) -rangayyan_emg_rms(x, window = 64L, fs = 1) +morie_rangayyan_emg_rms(x, window = 64L, fs = 1) } \arguments{ \item{x}{Numeric vector.} @@ -23,7 +23,9 @@ Named list \code{rms}, \code{window}, \code{fs}, \code{mean_rms}. \deqn{\mathrm{RMS}[n] = \sqrt{ (1/W) \sum_{k=n-W+1}^{n} x[k]^2 }} } \examples{ -set.seed(0); r <- rgemg(rnorm(500), window = 32); length(r$rms) +set.seed(0) +r <- rgemg(rnorm(500), window = 32) +length(r$rms) } \references{ Rangayyan Ch 8. diff --git a/r-package/morie/man/rgenv.Rd b/r-package/morie/man/rgenv.Rd index 88d2b55add..ad88c88455 100644 --- a/r-package/morie/man/rgenv.Rd +++ b/r-package/morie/man/rgenv.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgenv.R \name{rgenv} \alias{rgenv} -\alias{rangayyan_envelope} +\alias{morie_rangayyan_envelope} \title{Hilbert envelope -- Rangayyan Ch 5} \usage{ rgenv(x) -rangayyan_envelope(x) +morie_rangayyan_envelope(x) } \arguments{ \item{x}{Numeric vector.} @@ -28,7 +28,8 @@ signal directly from the FFT. \examples{ t <- seq(0, 1, length.out = 200) x <- cos(2 * pi * 5 * t) * (1 + 0.3 * cos(2 * pi * 0.5 * t)) -r <- rgenv(x); length(r$envelope) +r <- rgenv(x) +length(r$envelope) } \references{ Rangayyan Ch 5. diff --git a/r-package/morie/man/rgfir.Rd b/r-package/morie/man/rgfir.Rd index 35d9d22383..d37f8eaaa3 100644 --- a/r-package/morie/man/rgfir.Rd +++ b/r-package/morie/man/rgfir.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgfir.R \name{rgfir} \alias{rgfir} -\alias{rangayyan_fir_filter} +\alias{morie_rangayyan_fir_filter} \title{FIR lowpass filter (windowed sinc) -- Rangayyan Ch 3} \usage{ rgfir(x, cutoff, order = 51L, fs = 1, window = "hamming") -rangayyan_fir_filter(x, cutoff, order = 51L, fs = 1, window = "hamming") +morie_rangayyan_fir_filter(x, cutoff, order = 51L, fs = 1, window = "hamming") } \arguments{ \item{x}{Numeric vector. Input signal.} diff --git a/r-package/morie/man/rghfd.Rd b/r-package/morie/man/rghfd.Rd index 01d022f6cc..e1f5eca6f9 100644 --- a/r-package/morie/man/rghfd.Rd +++ b/r-package/morie/man/rghfd.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rghfd.R \name{rghfd} \alias{rghfd} -\alias{rangayyan_higuchi_fd} +\alias{morie_rangayyan_higuchi_fd} \title{Higuchi fractal dimension -- Rangayyan Ch 7} \usage{ rghfd(x, kmax = 10L) -rangayyan_higuchi_fd(x, kmax = 10L) +morie_rangayyan_higuchi_fd(x, kmax = 10L) } \arguments{ \item{x}{Numeric vector.} @@ -21,7 +21,8 @@ Named list \code{HFD}, \code{log_L}, \code{log_inv_k}, \code{kmax}. Higuchi (1988) fractal dimension via curve-length scaling. } \examples{ -set.seed(0); rghfd(rnorm(500), kmax = 8)$HFD +set.seed(0) +rghfd(rnorm(500), kmax = 8)$HFD } \references{ Higuchi (1988), Physica D 31:277. Rangayyan Ch 7. diff --git a/r-package/morie/man/rghrv.Rd b/r-package/morie/man/rghrv.Rd index 86259745da..59870ec1ac 100644 --- a/r-package/morie/man/rghrv.Rd +++ b/r-package/morie/man/rghrv.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rghrv.R \name{rghrv} \alias{rghrv} -\alias{rangayyan_hrv} +\alias{morie_rangayyan_hrv} \title{Heart rate variability (time-domain) -- Rangayyan Ch 6} \usage{ rghrv(rr_ms) -rangayyan_hrv(rr_ms) +morie_rangayyan_hrv(rr_ms) } \arguments{ \item{rr_ms}{Numeric vector of NN intervals in milliseconds.} @@ -29,7 +29,9 @@ sequence of consecutive NN (RR) intervals. } } \examples{ -set.seed(0); rgh <- rghrv(800 + rnorm(200, sd = 40)); rgh$heart_rate_bpm +set.seed(0) +rgh <- rghrv(800 + rnorm(200, sd = 40)) +rgh$heart_rate_bpm } \references{ Task Force (1996), Circulation 93:1043. Rangayyan Ch 6. diff --git a/r-package/morie/man/rgiir.Rd b/r-package/morie/man/rgiir.Rd index d031f8b3a8..ceca6e8486 100644 --- a/r-package/morie/man/rgiir.Rd +++ b/r-package/morie/man/rgiir.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgiir.R \name{rgiir} \alias{rgiir} -\alias{rangayyan_iir_filter} +\alias{morie_rangayyan_iir_filter} \title{Butterworth IIR filter -- Rangayyan Ch 3} \usage{ rgiir(x, cutoff, order = 4L, fs = 1, btype = c("low", "high", "pass", "stop")) -rangayyan_iir_filter( +morie_rangayyan_iir_filter( x, cutoff, order = 4L, diff --git a/r-package/morie/man/rglyp.Rd b/r-package/morie/man/rglyp.Rd index 24988be272..00aa4a8ac6 100644 --- a/r-package/morie/man/rglyp.Rd +++ b/r-package/morie/man/rglyp.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rglyp.R \name{rglyp} \alias{rglyp} -\alias{rangayyan_lyapunov} +\alias{morie_rangayyan_lyapunov} \title{Largest Lyapunov exponent (Rosenstein) -- Rangayyan Ch 7} \usage{ rglyp(x, m = 3L, tau = 1L, max_t = NULL, theiler = 10L) -rangayyan_lyapunov(x, m = 3L, tau = 1L, max_t = NULL, theiler = 10L) +morie_rangayyan_lyapunov(x, m = 3L, tau = 1L, max_t = NULL, theiler = 10L) } \arguments{ \item{x}{Numeric vector.} @@ -28,7 +28,8 @@ Rosenstein et al. (1993) algorithm via delay embedding and nearest-neighbour divergence tracking. } \examples{ -set.seed(0); rglyp(rnorm(200), m = 3, tau = 1, max_t = 20)$lyapunov +set.seed(0) +rglyp(rnorm(200), m = 3, tau = 1, max_t = 20)$lyapunov } \references{ Rosenstein et al. (1993), Physica D 65:117. diff --git a/r-package/morie/man/rgpsd.Rd b/r-package/morie/man/rgpsd.Rd index b90199c0c8..73daf5c953 100644 --- a/r-package/morie/man/rgpsd.Rd +++ b/r-package/morie/man/rgpsd.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgpsd.R \name{rgpsd} \alias{rgpsd} -\alias{rangayyan_psd} +\alias{morie_rangayyan_psd} \title{Welch power spectral density -- Rangayyan Ch 4} \usage{ rgpsd(x, fs = 1, nperseg = NULL, window = "hann") -rangayyan_psd(x, fs = 1, nperseg = NULL, window = "hann") +morie_rangayyan_psd(x, fs = 1, nperseg = NULL, window = "hann") } \arguments{ \item{x}{Numeric vector.} @@ -29,7 +29,8 @@ agree to ~1e-3 for typical EEG/ECG inputs. } \examples{ \donttest{ -set.seed(0); fs <- 100 +set.seed(0) +fs <- 100 t <- seq(0, 10, length.out = 1000) x <- sin(2 * pi * 10 * t) r <- rgpsd(x, fs = fs, nperseg = 256) diff --git a/r-package/morie/man/rgqrs.Rd b/r-package/morie/man/rgqrs.Rd index 9f1b184552..829853470a 100644 --- a/r-package/morie/man/rgqrs.Rd +++ b/r-package/morie/man/rgqrs.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgqrs.R \name{rgqrs} \alias{rgqrs} -\alias{rangayyan_qrs_detect} +\alias{morie_rangayyan_qrs_detect} \title{Pan-Tompkins QRS detector -- Rangayyan Ch 6} \usage{ rgqrs(x, fs = 360) -rangayyan_qrs_detect(x, fs = 360) +morie_rangayyan_qrs_detect(x, fs = 360) } \arguments{ \item{x}{Numeric ECG vector.} @@ -25,12 +25,15 @@ of integrated max, 200-ms refractory). } \examples{ \donttest{ - if (requireNamespace("signal", quietly = TRUE)) { - fs <- 360; t <- seq(0, 5, length.out = 5 * fs) - ecg <- rowSums(sapply(seq(0.5, 4.5, by = 1.0), - function(tk) exp(-((t - tk) * 30)^2))) - rgqrs(ecg, fs = fs)$r_peaks - } +if (requireNamespace("signal", quietly = TRUE)) { + fs <- 360 + t <- seq(0, 5, length.out = 5 * fs) + ecg <- rowSums(sapply( + seq(0.5, 4.5, by = 1.0), + function(tk) exp(-((t - tk) * 30)^2) + )) + rgqrs(ecg, fs = fs)$r_peaks +} } } \references{ diff --git a/r-package/morie/man/rgsam.Rd b/r-package/morie/man/rgsam.Rd index 02b7c3fac8..b2c3e26e0b 100644 --- a/r-package/morie/man/rgsam.Rd +++ b/r-package/morie/man/rgsam.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgsam.R \name{rgsam} \alias{rgsam} -\alias{rangayyan_sample_entropy} +\alias{morie_rangayyan_sample_entropy} \title{Sample entropy -- Rangayyan Ch 7} \usage{ rgsam(x, m = 2L, r = NULL) -rangayyan_sample_entropy(x, m = 2L, r = NULL) +morie_rangayyan_sample_entropy(x, m = 2L, r = NULL) } \arguments{ \item{x}{Numeric vector.} @@ -30,7 +30,8 @@ where \code{B} is the number of unordered template-vector pairs of length \code{m+1} (self-matches excluded). } \examples{ -set.seed(0); rgsam(rnorm(100), m = 2)$SampEn +set.seed(0) +rgsam(rnorm(100), m = 2)$SampEn } \references{ Richman & Moorman (2000), AJP Heart 278:H2039. diff --git a/r-package/morie/man/rgstf.Rd b/r-package/morie/man/rgstf.Rd index ebf2ab1da4..1c7ffd1a42 100644 --- a/r-package/morie/man/rgstf.Rd +++ b/r-package/morie/man/rgstf.Rd @@ -2,12 +2,18 @@ % Please edit documentation in R/rgstf.R \name{rgstf} \alias{rgstf} -\alias{rangayyan_stft} +\alias{morie_rangayyan_stft} \title{Short-time Fourier transform -- Rangayyan Ch 4} \usage{ rgstf(x, fs = 1, nperseg = 256L, noverlap = NULL, window = "hann") -rangayyan_stft(x, fs = 1, nperseg = 256L, noverlap = NULL, window = "hann") +morie_rangayyan_stft( + x, + fs = 1, + nperseg = 256L, + noverlap = NULL, + window = "hann" +) } \arguments{ \item{x}{Numeric vector.} @@ -30,8 +36,10 @@ Sliding-window FFT spectrogram (one-sided, PSD-scaled). Mirrors } \examples{ \donttest{ - t <- seq(0, 10, length.out = 1024); x <- sin(2 * pi * 10 * t) - r <- rgstf(x, fs = 100, nperseg = 128); dim(r$Sxx) +t <- seq(0, 10, length.out = 1024) +x <- sin(2 * pi * 10 * t) +r <- rgstf(x, fs = 100, nperseg = 128) +dim(r$Sxx) } } \references{ diff --git a/r-package/morie/man/rgwav.Rd b/r-package/morie/man/rgwav.Rd index 92f22c3678..daafc535be 100644 --- a/r-package/morie/man/rgwav.Rd +++ b/r-package/morie/man/rgwav.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgwav.R \name{rgwav} \alias{rgwav} -\alias{rangayyan_wavelet_denoise} +\alias{morie_rangayyan_wavelet_denoise} \title{Wavelet denoising (universal threshold) -- Rangayyan Ch 10} \usage{ rgwav(x, wavelet = "d8", level = NULL, mode = c("soft", "hard")) -rangayyan_wavelet_denoise( +morie_rangayyan_wavelet_denoise( x, wavelet = "d8", level = NULL, @@ -30,7 +30,7 @@ Named list \code{signal}, \code{threshold}, \code{sigma}, \code{wavelet}, \description{ Donoho-Johnstone soft/hard thresholding via the \code{wavelets} package (Daubechies DWT). Noise scale estimated from the finest-scale detail -as sigma = MAD / 0.6745; universal threshold T = sigma sqrt(2 ln N). +as sigma = MAD / 0.6745; universal threshold thr = sigma sqrt(2 ln N). } \details{ Falls back to a 5-tap moving-average smoother (with a warning) if @@ -39,11 +39,13 @@ signals. } \examples{ \donttest{ - set.seed(0); t <- seq(0, 1, length.out = 256) - x <- sin(2 * pi * 3 * t) + 0.3 * rnorm(256) - if (requireNamespace("wavelets", quietly = TRUE)) { - r <- rgwav(x, level = 3); length(r$signal) - } +set.seed(0) +t <- seq(0, 1, length.out = 256) +x <- sin(2 * pi * 3 * t) + 0.3 * rnorm(256) +if (requireNamespace("wavelets", quietly = TRUE)) { + r <- rgwav(x, level = 3) + length(r$signal) +} } } \references{ diff --git a/r-package/morie/man/rgzcr.Rd b/r-package/morie/man/rgzcr.Rd index cd3070f628..3398c4efed 100644 --- a/r-package/morie/man/rgzcr.Rd +++ b/r-package/morie/man/rgzcr.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rgzcr.R \name{rgzcr} \alias{rgzcr} -\alias{rangayyan_zero_crossing} +\alias{morie_rangayyan_zero_crossing} \title{Zero-crossing rate -- Rangayyan Ch 5} \usage{ rgzcr(x, fs = 1) -rangayyan_zero_crossing(x, fs = 1) +morie_rangayyan_zero_crossing(x, fs = 1) } \arguments{ \item{x}{Numeric vector.} diff --git a/r-package/morie/man/risk_difference_ci.Rd b/r-package/morie/man/risk_difference_ci.Rd deleted file mode 100644 index e6e7ea1c3f..0000000000 --- a/r-package/morie/man/risk_difference_ci.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{risk_difference_ci} -\alias{risk_difference_ci} -\title{Risk difference (ARD) with Newcombe CI} -\description{ - Risk difference (ARD) with Newcombe CI -} -\usage{ - risk_difference_ci(table_2x2, alpha) -} -\arguments{ - \item{table_2x2}{A 2x2 matrix: rows are exposure, columns are outcome.} - \item{alpha}{Significance level.} -} -\value{ - Named list: 'rd', 'ci_lower', 'ci_upper'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/risk_ratio_ci.Rd b/r-package/morie/man/risk_ratio_ci.Rd deleted file mode 100644 index acc5f2e960..0000000000 --- a/r-package/morie/man/risk_ratio_ci.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{risk_ratio_ci} -\alias{risk_ratio_ci} -\title{Risk ratio (relative risk) with log-normal CI} -\description{ - Risk ratio (relative risk) with log-normal CI -} -\usage{ - risk_ratio_ci(table_2x2, alpha) -} -\arguments{ - \item{table_2x2}{A 2x2 matrix: rows are exposure, columns are outcome (disease = col 1).} - \item{alpha}{Significance level.} -} -\value{ - Named list: 'rr', 'ci_lower', 'ci_upper'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/rkhsc.Rd b/r-package/morie/man/rkhsc.Rd index dbedd9a071..72fe5f0977 100644 --- a/r-package/morie/man/rkhsc.Rd +++ b/r-package/morie/man/rkhsc.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/rkhsc.R \name{rkhsc} \alias{rkhsc} -\alias{rkhs_kernel_regression} +\alias{morie_rkhs_kernel_regression} \title{RKHS kernel ridge regression (Wahba 1990)} \usage{ rkhsc(x, y, sigma = NULL, lam = 0.001) -rkhs_kernel_regression(x, y, sigma = NULL, lam = 0.001) +morie_rkhs_kernel_regression(x, y, sigma = NULL, lam = 0.001) } \arguments{ \item{x}{numeric vector or matrix of predictors.} diff --git a/r-package/morie/man/run_ebac_selection_ipw_analysis.Rd b/r-package/morie/man/run_ebac_selection_ipw_analysis.Rd deleted file mode 100644 index d1ab257dc1..0000000000 --- a/r-package/morie/man/run_ebac_selection_ipw_analysis.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by morie generate_rd.py -\name{run_ebac_selection_ipw_analysis} -\alias{run_ebac_selection_ipw_analysis} -\title{Run the eBAC selection-adjusted IPW workflow} -\description{ - Run the eBAC selection-adjusted IPW workflow - -Mirrors the core outputs of the old '07_ebac_ipw.R' workflow. -} -\usage{ - run_ebac_selection_ipw_analysis(data, output_dir, treatment, covariates) -} -\arguments{ - \item{data}{Analysis data frame.} - \item{output_dir}{Optional directory for CSV outputs.} - \item{treatment}{Treatment column name.} - \item{covariates}{Covariate names used in the observation model.} -} -\value{ - Named list of output tables and the observed-domain analysis frame. -} -\examples{ -\dontrun{ - set.seed(2026) - n <- 200L - x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n) - D <- rbinom(n, 1, plogis(0.5 * x1)) - df <- data.frame(D = D, x1 = x1, x2 = x2, x3 = x3) - out <- run_ebac_selection_ipw_analysis( - data = df, - output_dir = tempdir(), - treatment = "D", - covariates = c("x1", "x2", "x3") - ) - out$selected_covariates -} -} diff --git a/r-package/morie/man/run_pipeline.Rd b/r-package/morie/man/run_pipeline.Rd deleted file mode 100644 index 9805a74cdb..0000000000 --- a/r-package/morie/man/run_pipeline.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by morie generate_rd.py -\name{run_pipeline} -\alias{run_pipeline} -\title{Run multiple workflow steps} -\description{ - Run multiple workflow steps -} -\usage{ - run_pipeline(steps, project_root, script_map, stop_on_error, verbose) -} -\arguments{ - \item{steps}{Ordered vector of workflow step names.} - \item{project_root}{Project root directory.} - \item{script_map}{Named character vector mapping steps to script paths.} - \item{stop_on_error}{If 'TRUE', stop at first failure.} - \item{verbose}{If 'TRUE', streams command output.} -} -\value{ - Data frame of step statuses. -} -\examples{ -\dontrun{ - # Runs a sequence of workflow steps from the project root. - result <- run_pipeline(steps = c("modules", "publish"), - project_root = ".", - verbose = TRUE) - result$step_results -} -} diff --git a/r-package/morie/man/run_propensity_ipw_analysis.Rd b/r-package/morie/man/run_propensity_ipw_analysis.Rd deleted file mode 100644 index a491f7af2b..0000000000 --- a/r-package/morie/man/run_propensity_ipw_analysis.Rd +++ /dev/null @@ -1,39 +0,0 @@ -% Generated by morie generate_rd.py -\name{run_propensity_ipw_analysis} -\alias{run_propensity_ipw_analysis} -\title{Run the CPADS propensity/IPW workflow} -\description{ - Run the CPADS propensity/IPW workflow - -Mirrors the core outputs of the old '07_propensity.R' workflow. -} -\usage{ - run_propensity_ipw_analysis(data, output_dir, trim, treatment, outcome, covariates) -} -\arguments{ - \item{data}{Analysis data frame.} - \item{output_dir}{Optional directory for CSV outputs.} - \item{trim}{Quantile pair used to trim extreme IPW values.} - \item{treatment}{Binary treatment column.} - \item{outcome}{Binary outcome column.} - \item{covariates}{Covariate names for the propensity model.} -} -\value{ - Named list of output tables and the analysis data. -} -\examples{ -\dontrun{ - set.seed(2026) - n <- 200L - x <- rnorm(n) - D <- rbinom(n, 1, plogis(0.5 * x)) - y <- 0.7 * D + 0.3 * x + rnorm(n) - df <- data.frame(D = D, y = y, age = x) - out <- run_propensity_ipw_analysis( - data = df, - output_dir = tempdir(), - treatment = "D", outcome = "y", covariates = "age" - ) - out$ate -} -} diff --git a/r-package/morie/man/run_workflow_step.Rd b/r-package/morie/man/run_workflow_step.Rd deleted file mode 100644 index b50a3bb40a..0000000000 --- a/r-package/morie/man/run_workflow_step.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by morie generate_rd.py -\name{run_workflow_step} -\alias{run_workflow_step} -\title{Run one project workflow step} -\description{ - Run one project workflow step -} -\usage{ - run_workflow_step(step, project_root, script_map, rscript_bin, verbose) -} -\arguments{ - \item{step}{Step name present in 'script_map'.} - \item{project_root}{Project root directory.} - \item{script_map}{Named character vector mapping steps to script paths.} - \item{rscript_bin}{Optional path to 'Rscript' binary.} - \item{verbose}{If 'TRUE', streams command output.} -} -\value{ - Named list with step metadata and exit status. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/sample_size_logistic.Rd b/r-package/morie/man/sample_size_logistic.Rd deleted file mode 100644 index 856cb3a514..0000000000 --- a/r-package/morie/man/sample_size_logistic.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by morie generate_rd.py -\name{sample_size_logistic} -\alias{sample_size_logistic} -\title{Sample size for logistic regression (Hsieh, Bloch & Larsen 1998)} -\description{ - Sample size for logistic regression detecting a target odds ratio - -Uses the formula from Hsieh et al. (1998): -\deqn{n = \frac{(z_{\alpha/2} + z_\beta)^2}{p_1(1-p_1) [\log(OR)]^2}} -} -\usage{ - sample_size_logistic(p0, or, alpha, power, two_sided) -} -\arguments{ - \item{p0}{Prevalence under control.} - \item{or}{Target odds ratio.} - \item{alpha}{Significance level.} - \item{power}{Desired power.} - \item{two_sided}{Logical.} -} -\value{ - Integer sample size. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/sarla.Rd b/r-package/morie/man/sarla.Rd index e333144839..e83d954ded 100644 --- a/r-package/morie/man/sarla.Rd +++ b/r-package/morie/man/sarla.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/sarla.R \name{sarla} \alias{sarla} -\alias{spatial_ar_lag} +\alias{morie_spatial_ar_lag} \title{Spatial autoregressive lag model (SAR lag, ML).} \usage{ sarla(x, y, w) -spatial_ar_lag(x, y, w) +morie_spatial_ar_lag(x, y, w) } \arguments{ \item{x}{Design matrix (n by p).} @@ -24,10 +24,8 @@ Y = rho W Y + X beta + eps, eps ~ N(0, sigma2 I). Concentrated log-likelihood in rho. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Anselin (1988); Schabenberger & Gotway (2005), Ch 7. diff --git a/r-package/morie/man/sarre.Rd b/r-package/morie/man/sarre.Rd index aea2934fdd..02caf672ee 100644 --- a/r-package/morie/man/sarre.Rd +++ b/r-package/morie/man/sarre.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/sarre.R \name{sarre} \alias{sarre} -\alias{spatial_ar_error} +\alias{morie_spatial_ar_error} \title{Spatial autoregressive error model (SAR error, ML).} \usage{ sarre(x, y, w) -spatial_ar_error(x, y, w) +morie_spatial_ar_error(x, y, w) } \arguments{ \item{x}{Design matrix (n by p, intercept explicit).} @@ -25,10 +25,8 @@ Concentrated log-likelihood in lambda; beta via GLS on the transformed system A y = A X beta + eps, A = I - lambda W. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Anselin (1988); Schabenberger & Gotway (2005), Ch 7. diff --git a/r-package/morie/man/sensitivity_rosenbaum.Rd b/r-package/morie/man/sensitivity_rosenbaum.Rd deleted file mode 100644 index 33bbbe9c3a..0000000000 --- a/r-package/morie/man/sensitivity_rosenbaum.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by morie generate_rd.py -\name{sensitivity_rosenbaum} -\alias{sensitivity_rosenbaum} -\title{Rosenbaum sensitivity bounds for matched outcomes} -\description{ - Rosenbaum bounds sensitivity analysis - -For a range of hidden-confounding levels \eqn{\Gamma}, tests whether -the treatment effect remains significant. A large \eqn{\Gamma} at -which the result remains significant indicates robustness. - -Uses Wilcoxon signed-rank statistic bounds for matched designs. -For unmatched data, computes sign-score bounds. -} -\usage{ - sensitivity_rosenbaum(treated, control, gamma_range) -} -\arguments{ - \item{treated}{Numeric vector of outcomes for treated units.} - \item{control}{Numeric vector of outcomes for control units} - \item{gamma_range}{Numeric vector of \eqn{\Gamma} values to test.} -} -\value{ - Data frame with columns: 'gamma', 'p_lower', 'p_upper'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/sglm.Rd b/r-package/morie/man/sglm.Rd index bbd15a660a..7190be779e 100644 --- a/r-package/morie/man/sglm.Rd +++ b/r-package/morie/man/sglm.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/sglm.R \name{sglm} \alias{sglm} -\alias{spatial_glm} +\alias{morie_spatial_glm} \title{Spatial GLM (Gaussian-identity case via profile ML).} \usage{ sglm(x, y, coords, family = "gaussian") -spatial_glm(x, y, coords, family = "gaussian") +morie_spatial_glm(x, y, coords, family = "gaussian") } \arguments{ \item{x}{Numeric design matrix (n by p).} @@ -26,10 +26,7 @@ Y = X beta + delta + eps, delta ~ GP(0, sigma2 R_phi), R_phi exponential. Profile-likelihood ML over phi; beta via GLS. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +sglm(x = rnorm(50), y = rnorm(50), coords = matrix(runif(100), 50, 2)) } \references{ Schabenberger & Gotway (2005), Ch 5. diff --git a/r-package/morie/man/shapiro_wilk_test.Rd b/r-package/morie/man/shapiro_wilk_test.Rd deleted file mode 100644 index a98ce92b00..0000000000 --- a/r-package/morie/man/shapiro_wilk_test.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{shapiro_wilk_test} -\alias{shapiro_wilk_test} -\title{Shapiro-Wilk normality test} -\description{ - Shapiro-Wilk normality test -} -\usage{ - shapiro_wilk_test(x, alpha) -} -\arguments{ - \item{x}{Numeric vector.} - \item{alpha}{Significance level for the 'is_normal' flag (default 0.05).} -} -\value{ - Named list: 'W', 'p_value', 'is_normal'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/simple_random_sample.Rd b/r-package/morie/man/simple_random_sample.Rd deleted file mode 100644 index 04c0413a37..0000000000 --- a/r-package/morie/man/simple_random_sample.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{simple_random_sample} -\alias{simple_random_sample} -\title{Simple random sample from a data frame} -\description{ - Simple random sample from a data frame -} -\usage{ - simple_random_sample(df, n, replace, seed) -} -\arguments{ - \item{df}{A data frame.} - \item{n}{Number of units to select.} - \item{replace}{Sample with replacement? Default 'FALSE'.} - \item{seed}{Random seed for reproducibility.} -} -\value{ - A data frame of 'n' sampled rows with a '.weight' column added. -} -\examples{ - df <- data.frame(x = 1:100) - srs_sample <- simple_random_sample(df, 20) -} diff --git a/r-package/morie/man/smixd.Rd b/r-package/morie/man/smixd.Rd index 7951d0d28b..2fa011c44b 100644 --- a/r-package/morie/man/smixd.Rd +++ b/r-package/morie/man/smixd.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/smixd.R \name{smixd} \alias{smixd} -\alias{spatial_mixed_model} +\alias{morie_spatial_mixed_model} \title{Spatial linear mixed model via REML.} \usage{ smixd(x, y, coords) -spatial_mixed_model(x, y, coords) +morie_spatial_mixed_model(x, y, coords) } \arguments{ \item{x}{Numeric design matrix (n by p).} @@ -25,10 +25,7 @@ delta ~ N(0, sigma2 R_phi), R_phi exponential, eps ~ N(0, tau2 I). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +smixd(x = rnorm(50), y = rnorm(50), coords = matrix(runif(100), 50, 2)) } \references{ Patterson & Thompson (1971); Schabenberger & Gotway (2005), Ch 5. diff --git a/r-package/morie/man/sobls.Rd b/r-package/morie/man/sobls.Rd index 61e4c5a34a..2c48e0983c 100644 --- a/r-package/morie/man/sobls.Rd +++ b/r-package/morie/man/sobls.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/sobls.R \name{sobls} \alias{sobls} -\alias{sobol_sequence} +\alias{morie_sobol_sequence} \title{Sobol quasi-random sequence (Sobol 1967)} \usage{ sobls(N = 128L, d = 1L, f = NULL, scramble = TRUE, seed = 42L) -sobol_sequence(N = 128L, d = 1L, f = NULL, scramble = TRUE, seed = 42L) +morie_sobol_sequence(N = 128L, d = 1L, f = NULL, scramble = TRUE, seed = 42L) } \arguments{ \item{N}{integer; default 128.} diff --git a/r-package/morie/man/spblk.Rd b/r-package/morie/man/spblk.Rd index a420665ec8..1bacf465c3 100644 --- a/r-package/morie/man/spblk.Rd +++ b/r-package/morie/man/spblk.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/spblk.R \name{spblk} \alias{spblk} -\alias{spatial_block_kriging} +\alias{morie_spatial_block_kriging} \title{Ordinary block kriging.} \usage{ spblk(x, coords, blocks, n_quad = 25, nugget = 0, sill = 1, range_ = 1) -spatial_block_kriging( +morie_spatial_block_kriging( x, coords, blocks, @@ -38,10 +38,8 @@ Solve the OK system with point-to-block averaged covariances: \deqn{C_{\text{bar}}(s_i, B) = (1/|B|) \int_B C(s_i, u) du.} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Schabenberger & Gotway (2005), Ch 4. diff --git a/r-package/morie/man/spcrs.Rd b/r-package/morie/man/spcrs.Rd index 573f38762d..918916a78e 100644 --- a/r-package/morie/man/spcrs.Rd +++ b/r-package/morie/man/spcrs.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/spcrs.R \name{spcrs} \alias{spcrs} -\alias{spatial_cross_validation} +\alias{morie_spatial_cross_validation} \title{Leave-one-out cross-validation for ordinary kriging.} \usage{ spcrs(x, coords, nugget = 0, sill = 1, range_ = 1) -spatial_cross_validation(x, coords, nugget = 0, sill = 1, range_ = 1) +morie_spatial_cross_validation(x, coords, nugget = 0, sill = 1, range_ = 1) } \arguments{ \item{x}{Numeric vector.} @@ -23,10 +23,7 @@ Named list: estimate (MSPE, RMSPE, MAE, residuals), n, method. MSPE = (1/n) sum (Z(s_i) - Z_hat_minus_i(s_i))^2. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +spcrs(x = rnorm(50), coords = matrix(runif(100), 50, 2)) } \references{ Schabenberger & Gotway (2005), Ch 4. diff --git a/r-package/morie/man/spearman_rho.Rd b/r-package/morie/man/spearman_rho.Rd deleted file mode 100644 index a031ad152f..0000000000 --- a/r-package/morie/man/spearman_rho.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{spearman_rho} -\alias{spearman_rho} -\title{Spearman rank correlation} -\description{ - Spearman rank correlation -} -\usage{ - spearman_rho(x, y) -} -\arguments{ - \item{x}{Numeric vector.} - \item{y}{Numeric vector.} -} -\value{ - Named list: 'rho', 'p_value'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/sptag.Rd b/r-package/morie/man/sptag.Rd index 01f722e659..d9db6b861a 100644 --- a/r-package/morie/man/sptag.Rd +++ b/r-package/morie/man/sptag.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/sptag.R \name{sptag} \alias{sptag} -\alias{spatial_agreement} +\alias{morie_spatial_agreement} \title{Pairwise vote-agreement matrix (Armstrong Ch 8)} \usage{ sptag(x) -spatial_agreement(x) +morie_spatial_agreement(x) } \arguments{ \item{x}{Vote matrix (n by m); NA = absent.} @@ -21,9 +21,6 @@ A_ij = proportion of roll calls on which legislators i, j voted the same way (excluding mutually absent items). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +sptag(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/sptau.Rd b/r-package/morie/man/sptau.Rd index c52035a058..2e552deafe 100644 --- a/r-package/morie/man/sptau.Rd +++ b/r-package/morie/man/sptau.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/sptau.R \name{sptau} \alias{sptau} -\alias{spatial_autocorrelation} +\alias{morie_spatial_autocorrelation} \title{Moran's I spatial autocorrelation (Schabenberger Ch 1).} \usage{ sptau(x, w) -spatial_autocorrelation(x, w) +morie_spatial_autocorrelation(x, w) } \arguments{ \item{x}{Numeric vector of length n (observed values).} @@ -28,10 +28,8 @@ Formula: / \sum_i (x_i - \bar x)^2} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Cliff & Ord (1981). Schabenberger & Gotway (2005), Ch 1. diff --git a/r-package/morie/man/sptrn.Rd b/r-package/morie/man/sptrn.Rd index 6593c39f32..282ee8226b 100644 --- a/r-package/morie/man/sptrn.Rd +++ b/r-package/morie/man/sptrn.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/sptrn.R \name{sptrn} \alias{sptrn} -\alias{spatial_trend_surface} +\alias{morie_spatial_trend_surface} \title{Polynomial trend surface analysis (OLS).} \usage{ sptrn(x, coords, order = 2) -spatial_trend_surface(x, coords, order = 2) +morie_spatial_trend_surface(x, coords, order = 2) } \arguments{ \item{x}{Numeric vector.} @@ -23,10 +23,7 @@ Named list: estimate, se, r2, order, n, method. mu(s) = sum_k beta_k f_k(s); f_k are monomials up to degree \code{order}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +sptrn(x = rnorm(50), coords = matrix(runif(100), 50, 2)) } \references{ Schabenberger & Gotway (2005), Ch 2. diff --git a/r-package/morie/man/stacv.Rd b/r-package/morie/man/stacv.Rd index e38b44e952..0543273b65 100644 --- a/r-package/morie/man/stacv.Rd +++ b/r-package/morie/man/stacv.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/stacv.R \name{stacv} \alias{stacv} -\alias{spatiotemporal_autocovariance} +\alias{morie_spatiotemporal_autocovariance} \title{Empirical spatiotemporal autocovariance.} \usage{ stacv( @@ -15,7 +15,7 @@ stacv( max_temporal = NULL ) -spatiotemporal_autocovariance( +morie_spatiotemporal_autocovariance( x, coords, times, @@ -45,10 +45,7 @@ n, method. \sum_{(i,j) \in N(h,u)} (Z_i - \bar Z)(Z_j - \bar Z)}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +stacv(x = rnorm(50), coords = matrix(runif(100), 50, 2), times = sort(cumsum(rexp(50)))) } \references{ Cressie & Huang (1999); Schabenberger & Gotway (2005), Ch 8. diff --git a/r-package/morie/man/stkrg.Rd b/r-package/morie/man/stkrg.Rd index 86a48e3ef9..fe76297116 100644 --- a/r-package/morie/man/stkrg.Rd +++ b/r-package/morie/man/stkrg.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/stkrg.R \name{stkrg} \alias{stkrg} -\alias{spatiotemporal_kriging} +\alias{morie_spatiotemporal_kriging} \title{Spatiotemporal ordinary kriging with separable exponential covariance.} \usage{ stkrg(x, coords, times, target, sill = 1, nugget = 0, range_s = 1, range_t = 1) -spatiotemporal_kriging( +morie_spatiotemporal_kriging( x, coords, times, @@ -37,10 +37,8 @@ C((h, u)) = (sill - nugget) * exp(-h/range_s) * exp(-|u|/range_t) + nugget if (h, u) == (0, 0). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \references{ Schabenberger & Gotway (2005), Ch 8. diff --git a/r-package/morie/man/strat.Rd b/r-package/morie/man/strat.Rd index 4ce8c3d158..18ffcce897 100644 --- a/r-package/morie/man/strat.Rd +++ b/r-package/morie/man/strat.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/strat.R \name{strat} \alias{strat} -\alias{stratified_sampling} +\alias{morie_stratified_sampling} \title{Stratified mean estimator (Cochran 1977, Sampling Techniques, Ch. 5)} \usage{ strat(data, y = "y", strata = "stratum", pop_sizes = NULL) -stratified_sampling(data, y = "y", strata = "stratum", pop_sizes = NULL) +morie_stratified_sampling(data, y = "y", strata = "stratum", pop_sizes = NULL) } \arguments{ \item{data}{data.frame containing outcome and stratum columns.} diff --git a/r-package/morie/man/stratified_sample.Rd b/r-package/morie/man/stratified_sample.Rd deleted file mode 100644 index e9bfc63cd6..0000000000 --- a/r-package/morie/man/stratified_sample.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by morie generate_rd.py -\name{stratified_sample} -\alias{stratified_sample} -\title{Stratified random sample from a data frame} -\description{ - Proportional or fixed stratified random sample -} -\usage{ - stratified_sample(df, strata_col, n_per_stratum, proportional, seed) -} -\arguments{ - \item{df}{A data frame.} - \item{strata_col}{Name of the stratification column.} - \item{n_per_stratum}{Either an integer (equal allocation) or a named integer} - \item{proportional}{Logical; if 'TRUE', allocate proportionally to strata sizes.} - \item{seed}{Random seed.} -} -\value{ - Data frame of sampled rows with a '.weight' column. -} -\examples{ - df <- data.frame(g = c(rep("A", 60), rep("B", 40)), x = rnorm(100)) - stratified_sample(df, "g", n_per_stratum = 10) -} diff --git a/r-package/morie/man/stvar.Rd b/r-package/morie/man/stvar.Rd index 038fb85bb8..cff71e5cdd 100644 --- a/r-package/morie/man/stvar.Rd +++ b/r-package/morie/man/stvar.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/stvar.R \name{stvar} \alias{stvar} -\alias{spatiotemporal_variogram} +\alias{morie_spatiotemporal_variogram} \title{Empirical spatiotemporal semivariogram gamma(h, u).} \usage{ stvar( @@ -15,7 +15,7 @@ stvar( max_temporal = NULL ) -spatiotemporal_variogram( +morie_spatiotemporal_variogram( x, coords, times, @@ -45,10 +45,7 @@ counts), n, method. \sum_{(i,j)\in N(h,u)} (Z(s_i, t_i) - Z(s_j, t_j))^2}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +stvar(x = rnorm(50), coords = matrix(runif(100), 50, 2), times = sort(cumsum(rexp(50)))) } \references{ Cressie & Huang (1999); Schabenberger & Gotway (2005), Ch 8. diff --git a/r-package/morie/man/substance_categories.Rd b/r-package/morie/man/substance_categories.Rd index 7a7c3c1389..b901b9d8ec 100644 --- a/r-package/morie/man/substance_categories.Rd +++ b/r-package/morie/man/substance_categories.Rd @@ -1,23 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} \name{substance_categories} \alias{substance_categories} -\docType{data} -\title{Substance category lookup table} -\description{ - Mapping from raw substance / drug-class strings (as they appear in - the source datasets) to the canonical category labels used in - \pkg{morie} analyses. +\title{Substance Categories} +\format{ +A data.frame with columns: +\describe{ +\item{key}{Short key (e.g., \code{"alcohol"}, \code{"cannabis"})} +\item{label}{Display label (e.g., \code{"Alcohol"}, \code{"Cannabis"})} +\item{source_file}{Filename in healthinfobase/CSUS/ directory} +} +} +\source{ +Canadian Substance Use Survey (CSUS) via Health Infobase Canada. } \usage{ - data(substance_categories) +substance_categories } -\format{ - A data frame with two columns: \code{raw} (the source string) and - \code{category} (the canonical label). +\description{ +Canonical substance category mapping used across CSUS HealthInfobase +data files. Maps short keys to human-readable labels and source filenames. } -\keyword{datasets} \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +data(substance_categories) +substance_categories$label } +\keyword{datasets} diff --git a/r-package/morie/man/summarize_output_audit.Rd b/r-package/morie/man/summarize_output_audit.Rd deleted file mode 100644 index 8b8ba0b4d6..0000000000 --- a/r-package/morie/man/summarize_output_audit.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by morie generate_rd.py -\name{summarize_output_audit} -\alias{summarize_output_audit} -\title{Summarize an output audit} -\description{ - Summarize an output audit -} -\usage{ - summarize_output_audit(audit_tbl) -} -\arguments{ - \item{audit_tbl}{Result from [audit_public_outputs()].} -} -\value{ - Named list with high-level diagnostics. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/temperature_scaling.Rd b/r-package/morie/man/temperature_scaling.Rd index 80f24bfd1c..669f301aa4 100644 --- a/r-package/morie/man/temperature_scaling.Rd +++ b/r-package/morie/man/temperature_scaling.Rd @@ -4,15 +4,15 @@ \alias{temperature_scaling} \title{Temperature-scaled softmax (Hinton 2015)} \usage{ -temperature_scaling(x, T = 1) +temperature_scaling(x, temperature = 1) } \arguments{ \item{x}{Numeric vector of logits.} -\item{T}{Numeric softmax temperature > 0 (default 1).} +\item{temperature}{Numeric softmax temperature > 0 (default 1).} } \value{ -Named list with tensor, entropy, T, method. +Named list with tensor, entropy, temperature, method. } \description{ Temperature-scaled softmax (Hinton 2015) diff --git a/r-package/morie/man/top_k_decoding.Rd b/r-package/morie/man/top_k_decoding.Rd index 81ace80406..f942d8762f 100644 --- a/r-package/morie/man/top_k_decoding.Rd +++ b/r-package/morie/man/top_k_decoding.Rd @@ -4,14 +4,14 @@ \alias{top_k_decoding} \title{Top-k filtered softmax (Fan 2018)} \usage{ -top_k_decoding(x, k = 5L, T = 1) +top_k_decoding(x, k = 5L, temperature = 1) } \arguments{ \item{x}{Numeric vector of logits.} \item{k}{Integer truncation rank (default 5).} -\item{T}{Numeric softmax temperature (default 1).} +\item{temperature}{Numeric softmax temperature (default 1).} } \value{ Named list with tensor, topk_indices, topk_logits, k, method. diff --git a/r-package/morie/man/top_p_nucleus.Rd b/r-package/morie/man/top_p_nucleus.Rd index e5b57032ee..e74b289457 100644 --- a/r-package/morie/man/top_p_nucleus.Rd +++ b/r-package/morie/man/top_p_nucleus.Rd @@ -4,14 +4,14 @@ \alias{top_p_nucleus} \title{Top-p nucleus sampling (Holtzman 2020)} \usage{ -top_p_nucleus(x, p = 0.9, T = 1) +top_p_nucleus(x, p = 0.9, temperature = 1) } \arguments{ \item{x}{Numeric vector of logits.} \item{p}{Numeric nucleus mass cutoff in (0, 1] (default 0.9).} -\item{T}{Numeric softmax temperature (default 1).} +\item{temperature}{Numeric softmax temperature (default 1).} } \value{ Named list with tensor, keep_mask, n_kept, p, method. diff --git a/r-package/morie/man/tpspn.Rd b/r-package/morie/man/tpspn.Rd index 29c46313ac..672e01f7e4 100644 --- a/r-package/morie/man/tpspn.Rd +++ b/r-package/morie/man/tpspn.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/tpspn.R \name{tpspn} \alias{tpspn} -\alias{thin_plate_spline} +\alias{morie_thin_plate_spline} \title{Thin-plate spline regression (Duchon 1977)} \usage{ tpspn(x, y, lam = 0) -thin_plate_spline(x, y, lam = 0) +morie_thin_plate_spline(x, y, lam = 0) } \arguments{ \item{x}{numeric vector or matrix of predictors.} diff --git a/r-package/morie/man/two_sample_t_test.Rd b/r-package/morie/man/two_sample_t_test.Rd deleted file mode 100644 index 5150e71858..0000000000 --- a/r-package/morie/man/two_sample_t_test.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by morie generate_rd.py -\name{two_sample_t_test} -\alias{two_sample_t_test} -\title{Two-sample t-test with tidy output} -\description{ - Two-sample t-test with tidy output -} -\usage{ - two_sample_t_test(x1, x2, equal_var, alternative) -} -\arguments{ - \item{x1}{Numeric vector (group 1).} - \item{x2}{Numeric vector (group 2).} - \item{equal_var}{Assume equal variances? Default 'FALSE' (Welch test).} - \item{alternative}{'"two.sided"', '"greater"', or '"less"'.} -} -\value{ - Named list: 't', 'df', 'p_value', 'ci_diff', 'cohens_d'. -} -\examples{ - two_sample_t_test(rnorm(50, 0.5), rnorm(50, 0)) -} diff --git a/r-package/morie/man/ukrig.Rd b/r-package/morie/man/ukrig.Rd index 62acc5d862..33a14318b2 100644 --- a/r-package/morie/man/ukrig.Rd +++ b/r-package/morie/man/ukrig.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/ukrig.R \name{ukrig} \alias{ukrig} -\alias{universal_kriging} +\alias{morie_universal_kriging} \title{Universal kriging with polynomial trend.} \usage{ ukrig( @@ -16,7 +16,7 @@ ukrig( trend_order = 1 ) -universal_kriging( +morie_universal_kriging( x, coords, target, @@ -47,10 +47,7 @@ Named list: estimate, se, n, method. Z(s) = mu(s) + delta(s), mu(s) = sum_k beta_k f_k(s). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +ukrig(x = rnorm(50), coords = matrix(runif(100), 50, 2), target = rnorm(50)) } \references{ Schabenberger & Gotway (2005), Ch 4. diff --git a/r-package/morie/man/unfdl.Rd b/r-package/morie/man/unfdl.Rd index 6971d68448..7dd3fbc0cc 100644 --- a/r-package/morie/man/unfdl.Rd +++ b/r-package/morie/man/unfdl.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/unfdl.R \name{unfdl} \alias{unfdl} -\alias{unfolding_analysis} +\alias{morie_unfolding_analysis} \title{Metric unfolding for preference data (Schoenemann; Armstrong Ch 7)} \usage{ unfdl(x, k = 2L, n_iter = 100L, tol = 1e-06) -unfolding_analysis(x, k = 2L, n_iter = 100L, tol = 1e-06) +morie_unfolding_analysis(x, k = 2L, n_iter = 100L, tol = 1e-06) } \arguments{ \item{x}{Preference dissimilarity matrix Delta (n_resp by n_stim).} @@ -27,9 +27,7 @@ Closed-form Schoenemann (1970) unfolding via SVD of the doubly centred squared-distance matrix, followed by SMACOF-lite refinement. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \keyword{internal} diff --git a/r-package/morie/man/validate_cpads_data.Rd b/r-package/morie/man/validate_cpads_data.Rd deleted file mode 100644 index ff5f07bede..0000000000 --- a/r-package/morie/man/validate_cpads_data.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{validate_cpads_data} -\alias{validate_cpads_data} -\title{Validate a CPADS analysis data frame} -\description{ - Validate a CPADS analysis data frame -} -\usage{ - validate_cpads_data(data, strict) -} -\arguments{ - \item{data}{Data frame to validate.} - \item{strict}{If 'TRUE', stop when required variables are missing.} -} -\value{ - Character vector of missing variable names. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/validate_outputs_manifest.Rd b/r-package/morie/man/validate_outputs_manifest.Rd deleted file mode 100644 index 0e7e910c37..0000000000 --- a/r-package/morie/man/validate_outputs_manifest.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by morie generate_rd.py -\name{validate_outputs_manifest} -\alias{validate_outputs_manifest} -\title{Validate outputs manifest structure} -\description{ - Validate outputs manifest structure -} -\usage{ - validate_outputs_manifest(manifest, strict) -} -\arguments{ - \item{manifest}{Data frame to validate.} - \item{strict}{If 'TRUE', stop on validation failures.} -} -\value{ - 'TRUE' when validation passes. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/vines.Rd b/r-package/morie/man/vines.Rd index 39fead1552..edb855d749 100644 --- a/r-package/morie/man/vines.Rd +++ b/r-package/morie/man/vines.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/vines.R \name{vines} \alias{vines} -\alias{vine_copula} +\alias{morie_vine_copula} \title{Gaussian D-vine pair-copula construction (Aas, Czado, Frigessi & Bakken 2009)} \usage{ vines(x) -vine_copula(x) +morie_vine_copula(x) } \arguments{ \item{x}{matrix (n x d) of continuous variables.} diff --git a/r-package/morie/man/vrgft.Rd b/r-package/morie/man/vrgft.Rd index f39399fb4f..0b6f2e275a 100644 --- a/r-package/morie/man/vrgft.Rd +++ b/r-package/morie/man/vrgft.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/vrgft.R \name{vrgft} \alias{vrgft} -\alias{variogram_fitting} +\alias{morie_variogram_fitting} \title{Variogram model fit by weighted least squares.} \usage{ vrgft(x, coords, model = "exponential", n_bins = 10, max_dist = NULL) -variogram_fitting( +morie_variogram_fitting( x, coords, model = "exponential", @@ -35,10 +35,7 @@ Models: exponential, gaussian, spherical. Exponential form: \deqn{\gamma(h) = c_0 + c_1(1 - e^{-h/a})}. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +vrgft(x = rnorm(50), coords = matrix(runif(100), 50, 2)) } \references{ Cressie (1985); Schabenberger & Gotway (2005), Ch 3. diff --git a/r-package/morie/man/vrgm.Rd b/r-package/morie/man/vrgm.Rd index 25dcd5567c..ad7cb98953 100644 --- a/r-package/morie/man/vrgm.Rd +++ b/r-package/morie/man/vrgm.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/vrgm.R \name{vrgm} \alias{vrgm} -\alias{variogram_estimation} +\alias{morie_variogram_estimation} \title{Empirical (Matheron) variogram estimation.} \usage{ vrgm(x, coords, n_bins = 10, max_dist = NULL) -variogram_estimation(x, coords, n_bins = 10, max_dist = NULL) +morie_variogram_estimation(x, coords, n_bins = 10, max_dist = NULL) } \arguments{ \item{x}{Numeric vector, length n.} @@ -27,10 +27,7 @@ n, method. \sum_{(i,j) \in N(h)} (Z(s_i) - Z(s_j))^2} } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +vrgm(x = rnorm(50), coords = matrix(runif(100), 50, 2)) } \references{ Matheron (1962); Schabenberger & Gotway (2005), Ch 3. diff --git a/r-package/morie/man/vtpwr.Rd b/r-package/morie/man/vtpwr.Rd index a1f0ec3812..2aee3d03e0 100644 --- a/r-package/morie/man/vtpwr.Rd +++ b/r-package/morie/man/vtpwr.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/vtpwr.R \name{vtpwr} \alias{vtpwr} -\alias{voting_power_index} +\alias{morie_voting_power_index} \title{Banzhaf and Shapley-Shubik voting-power indices (Armstrong Ch 10)} \usage{ vtpwr(x, quota = NULL) -voting_power_index(x, quota = NULL) +morie_voting_power_index(x, quota = NULL) } \arguments{ \item{x}{Numeric weight vector w.} @@ -23,9 +23,6 @@ Exact enumeration for n <= 10; Monte Carlo for larger games. Quota defaults to strict simple majority. } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +vtpwr(x = rnorm(50)) } \keyword{internal} diff --git a/r-package/morie/man/wilcoxon_signed_rank_test.Rd b/r-package/morie/man/wilcoxon_signed_rank_test.Rd deleted file mode 100644 index 96824591fb..0000000000 --- a/r-package/morie/man/wilcoxon_signed_rank_test.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by morie generate_rd.py -\name{wilcoxon_signed_rank_test} -\alias{wilcoxon_signed_rank_test} -\title{Wilcoxon signed-rank test (paired)} -\description{ - Wilcoxon signed-rank test (paired) -} -\usage{ - wilcoxon_signed_rank_test(x1, x2, alternative) -} -\arguments{ - \item{x1}{Numeric vector (before).} - \item{x2}{Numeric vector (after).} - \item{alternative}{'"two.sided"', '"greater"', or '"less"'.} -} -\value{ - Named list: 'V', 'p_value'. -} -\examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} -} diff --git a/r-package/morie/man/wnom.Rd b/r-package/morie/man/wnom.Rd index e5c62a431a..64a2275aa1 100644 --- a/r-package/morie/man/wnom.Rd +++ b/r-package/morie/man/wnom.Rd @@ -2,15 +2,15 @@ % Please edit documentation in R/wnom.R \name{wnom} \alias{wnom} -\alias{wnominate_estimate} -\alias{wnominate} +\alias{morie_wnominate_estimate} +\alias{morie_wnominate} \title{W-NOMINATE Gaussian-utility log-likelihood (Armstrong Ch 3)} \usage{ wnom(votes, x, z_yea, z_nay, beta = 15, w = NULL) -wnominate_estimate(votes, x, z_yea, z_nay, beta = 15, w = NULL) +morie_wnominate_estimate(votes, x, z_yea, z_nay, beta = 15, w = NULL) -wnominate(votes, x, z_yea, z_nay, beta = 15, w = NULL) +morie_wnominate(votes, x, z_yea, z_nay, beta = 15, w = NULL) } \arguments{ \item{votes}{(n_leg by n_votes) matrix; 1 = yea, 0 = nay, NA = miss.} @@ -37,9 +37,7 @@ U_ijy = beta * exp(-0.5 sum_k w_k^2 (x_ik - z_yjk)^2) + epsilon P(yea) = Phi(U_yea - U_nay). } \examples{ -\dontrun{ - # See the package vignettes for usage examples: - # vignette(package = "morie") -} +# See the package vignettes for usage examples: +# vignette(package = "morie") } \keyword{internal} diff --git a/r-package/morie/man/write_synthetic_data.Rd b/r-package/morie/man/write_synthetic_data.Rd deleted file mode 100644 index c051eb6f54..0000000000 --- a/r-package/morie/man/write_synthetic_data.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by morie generate_rd.py -\name{write_synthetic_data} -\alias{write_synthetic_data} -\title{Write synthetic epidemiology-style data to CSV} -\description{ - Write synthetic epidemiology-style data to CSV -} -\usage{ - write_synthetic_data(path, n, seed, special_code_rate, profile, name_map, overwrite) -} -\arguments{ - \item{path}{Output CSV path.} - \item{n}{Number of rows.} - \item{seed}{Random seed.} - \item{special_code_rate}{Proportion of survey-style missing codes.} - \item{profile}{Naming profile for output columns.} - \item{name_map}{Optional custom variable name map.} - \item{overwrite}{If 'TRUE', overwrite existing file.} -} -\value{ - Normalized output path. -} -\examples{ -tmp <- tempfile(fileext = ".csv") -path <- write_synthetic_data(path = tmp, n = 100, seed = 2026, - overwrite = TRUE) -df <- read.csv(path) -nrow(df); names(df) -file.remove(path) -} diff --git a/r-package/morie/scripts/validate_siu_parser.R b/r-package/morie/scripts/validate_siu_parser.R new file mode 100644 index 0000000000..ec368c775e --- /dev/null +++ b/r-package/morie/scripts/validate_siu_parser.R @@ -0,0 +1,221 @@ +#!/usr/bin/env Rscript +# SPDX-License-Identifier: AGPL-3.0-or-later +# +# validate_siu_parser.R -- LOCAL DIAGNOSTIC for the C/C++ SIU parser. +# Not shipped to users; not the package's validation methodology. +# The package's validation contract is: every emitted row is +# reproducible from its cached HTML via the C++ parser (see +# morie_siu_audit_case()). This script is just a fast diagnostic for +# maintainers who want to spot-check changes against locally +# available reference files. +# +# Tier 1 (Schema): emitted columns match the reference SIU.csv header +# exactly (order + names). Required to pass. +# +# Tier 2 (Replication parity): for a random sample of drids that the +# prior parser version's SIU.csv produced rows for, re-fetch with +# the current transport and re-parse. Report per-field exact-match +# rate. Useful for catching unintended behavioural changes between +# parser versions; not a correctness claim. +# +# Tier 3 (External agreement, informational only): for the subset of +# sampled drids whose case_number also appears in a user-supplied +# external coding (e.g. a hand-coded survey), report per-field +# agreement. An external source is NOT ground truth -- the SIU +# report HTML is. Agreement with an external source is a sanity +# check; disagreements are not parser bugs by themselves, they +# are leads to investigate by reading the HTML. +# +# Usage: +# Rscript scripts/validate_siu_parser.R \ +# --ref-dir \ +# --sample 100 \ +# --out + +suppressPackageStartupMessages({ + library(morie) +}) + +# ---------------------------- args -------------------------------- +args <- commandArgs(trailingOnly = TRUE) +get_arg <- function(flag, default) { + i <- which(args == flag) + if (length(i) == 1L && i < length(args)) args[i + 1L] else default +} +ref_dir <- get_arg("--ref-dir", "/home/perseus/morie-work/refs") +sample_n <- as.integer(get_arg("--sample", 100L)) +out_path <- get_arg("--out", "validate-out.json") +seed <- as.integer(get_arg("--seed", 20260520L)) +rate_rps <- as.numeric(get_arg("--rate", 4.0)) + +stopifnot(dir.exists(ref_dir), + file.exists(file.path(ref_dir, "SIU.csv")), + file.exists(file.path(ref_dir, "SIU1a.xlsx"))) + +# -------------------- Tier 1: schema check ------------------------ +cat("\n=== Tier 1: schema ===\n") +ref_hdr <- readLines(file.path(ref_dir, "SIU.csv"), n = 1L) +ref_cols <- strsplit(ref_hdr, ",", fixed = TRUE)[[1]] + +# Parse our synthetic fixture to get our emitted column set in canonical +# order without hitting the network. The C++ parser ALWAYS emits the +# same 64-column structure regardless of input content. +fake_html <- paste0( + "

The Investigation

", + "

Notification: On January 1, 2024 the Toronto Police Service ", + "contacted the SIU. Director's Report for Case # 24-TFI-001.

", + "" +) +our_row <- morie:::.siu_parse_report(fake_html, 9999L, "test") +our_cols <- names(our_row) + +schema_match <- identical(ref_cols, our_cols) +cat(sprintf(" ref cols: %d\n v0.2.0 cols: %d\n exact match: %s\n", + length(ref_cols), length(our_cols), schema_match)) +if (!schema_match) { + cat(" ref - ours:", setdiff(ref_cols, our_cols), "\n") + cat(" ours - ref:", setdiff(our_cols, ref_cols), "\n") +} + +# -------- pick the sample (drids that v0.1.0 parsed cleanly) ------- +ref_csv <- utils::read.csv(file.path(ref_dir, "SIU.csv"), + colClasses = "character", check.names = FALSE) +ref_csv$drid <- suppressWarnings(as.integer(ref_csv$drid)) +ref_valid <- ref_csv[is.finite(ref_csv$drid) & nzchar(ref_csv$case_number), , + drop = FALSE] +cat(sprintf("\nReference SIU.csv: %d rows, %d with case_number\n", + nrow(ref_csv), nrow(ref_valid))) + +set.seed(seed) +sample_drids <- sort(sample(ref_valid$drid, + size = min(sample_n, nrow(ref_valid)))) +cat(sprintf("Sampling %d drids (seed=%d)\n", length(sample_drids), seed)) + +# ---- re-fetch and re-parse with v0.2.0 throttled fetcher --------- +urls <- sprintf("https://www.siu.on.ca/en/directors_report_details.php?drid=%d", + sample_drids) +cat(sprintf("\nFetching %d drids at %.1f rps ...\n", length(urls), rate_rps)) +t0 <- Sys.time() +fetch <- morie:::.siu_http_get_many_with_status( + urls, concurrency = 4L, timeout_s = 60L, + rate_rps = rate_rps, max_retries = 3L +) +elapsed <- as.numeric(difftime(Sys.time(), t0, units = "secs")) +cat(sprintf(" done in %.1fs (mean body: %.0f bytes, healthy 200s: %d/%d)\n", + elapsed, mean(nchar(fetch$body)), + sum(fetch$http_code == 200L), length(urls))) + +ours <- lapply(seq_along(sample_drids), function(i) { + if (nchar(fetch$body[i]) < 1000L) return(NULL) + morie:::.siu_parse_report(fetch$body[i], sample_drids[i], urls[i]) +}) +ok <- !vapply(ours, is.null, logical(1)) +ours_df <- as.data.frame(do.call(rbind, lapply(ours[ok], as.character)), + stringsAsFactors = FALSE) +names(ours_df) <- our_cols +ours_df$drid <- sample_drids[ok] +cat(sprintf("v0.2.0 parsed %d / %d sampled\n", sum(ok), length(sample_drids))) + +# -------------------- Tier 2: replication parity ------------------- +cat("\n=== Tier 2: replication parity (v0.2.0 vs v0.1.0) ===\n") +ref_joined <- merge(ours_df, ref_valid, by = "drid", + suffixes = c(".new", ".ref")) +parity_cols <- setdiff(our_cols, c("drid", "scraped_at_utc", + "parser_version")) +parity_rate <- vapply(parity_cols, function(col) { + a <- ref_joined[[paste0(col, ".new")]] + b <- ref_joined[[paste0(col, ".ref")]] + if (is.null(a) || is.null(b)) return(NA_real_) + mean(trimws(a) == trimws(b), na.rm = TRUE) +}, numeric(1)) +parity_overall <- mean(parity_rate, na.rm = TRUE) +cat(sprintf(" overall per-field exact-match: %.2f%%\n", + 100 * parity_overall)) +cat(" worst 8 fields:\n") +worst <- sort(parity_rate)[1:8] +for (i in seq_along(worst)) { + cat(sprintf(" %-40s %.1f%%\n", names(worst)[i], 100 * worst[i])) +} + +# -------------------- Tier 3: human ground truth ------------------ +cat("\n=== Tier 3: correctness vs SIU1a.xlsx (Qualtrics-coded) ===\n") +if (requireNamespace("readxl", quietly = TRUE)) { + siu1a <- readxl::read_excel(file.path(ref_dir, "SIU1a.xlsx")) + siu1a <- as.data.frame(siu1a) + # First row is the question prose; drop it. + if (any(grepl("Start Date", siu1a[1, ], fixed = TRUE))) { + siu1a <- siu1a[-1L, , drop = FALSE] + } + # Q-col -> morie-field mapping (from SIU_diff_vs_1a.jsonl) + q_map <- list( + Q1 = "case_number", + Q3 = "police_service", + Q4 = "number_of_officers_involved", + Q5 = "location_of_call", + Q9 = "number_of_affected_persons", + Q10 = "sex_gender_affected", + Q11 = "age_affected", + Q14 = "number_of_civilian_witnesses", + Q16 = "number_of_subject_officials", + Q19 = "number_of_witness_officials", + Q26 = "charges_recommended" + ) + siu1a$case_number <- as.character(siu1a$Q1) + joined <- merge(ours_df, siu1a, by = "case_number") + cat(sprintf(" cases in BOTH our parse + SIU1a Qualtrics: %d\n", + nrow(joined))) + + normalise_val <- function(v) { + v <- trimws(as.character(v)) + v <- gsub("\\.0+$", "", v) # 2.0 -> 2 + v <- tolower(v) + v[v %in% c("na", "n/a", "")] <- "" + v + } + agree <- lapply(names(q_map), function(q) { + fld <- q_map[[q]] + if (!q %in% names(joined) || !fld %in% names(joined)) return(NULL) + a <- normalise_val(joined[[fld]]) + b <- normalise_val(joined[[q]]) + keep <- nzchar(a) & nzchar(b) + if (!any(keep)) return(NULL) + data.frame( + field = fld, q = q, n = sum(keep), + agree = sum(a[keep] == b[keep]), + rate = mean(a[keep] == b[keep]), + stringsAsFactors = FALSE + ) + }) + agree_df <- do.call(rbind, agree) + cat(" per-field agreement (human-coded ground truth):\n") + for (i in seq_len(nrow(agree_df))) { + cat(sprintf(" %-40s %d/%d = %.1f%%\n", + agree_df$field[i], agree_df$agree[i], agree_df$n[i], + 100 * agree_df$rate[i])) + } + overall_human <- weighted.mean(agree_df$rate, agree_df$n) + cat(sprintf(" weighted overall correctness: %.2f%%\n", + 100 * overall_human)) +} else { + cat(" skipped: install.packages('readxl') to enable Tier 3\n") + agree_df <- NULL + overall_human <- NA_real_ +} + +# -------------------- write JSON summary -------------------------- +summary <- list( + schema_match = schema_match, + ref_cols = length(ref_cols), + our_cols = length(our_cols), + sample_n = length(sample_drids), + parsed_n = sum(ok), + rate_rps = rate_rps, + parity_overall = unname(parity_overall), + parity_per_field = as.list(parity_rate), + tier3_overall = unname(overall_human), + tier3_per_field = if (!is.null(agree_df)) as.list(setNames( + agree_df$rate, agree_df$field)) else NULL +) +writeLines(jsonlite::toJSON(summary, pretty = TRUE, auto_unbox = TRUE), + out_path) +cat(sprintf("\nWrote summary to %s\n", out_path)) diff --git a/r-package/morie/src/.gitignore b/r-package/morie/src/.gitignore index 9d22eb46a9..445528743b 100644 --- a/r-package/morie/src/.gitignore +++ b/r-package/morie/src/.gitignore @@ -1,2 +1,4 @@ *.o *.so +Makevars +Makevars.win diff --git a/r-package/morie/src/Makevars b/r-package/morie/src/Makevars deleted file mode 100644 index e31e8b139d..0000000000 --- a/r-package/morie/src/Makevars +++ /dev/null @@ -1,3 +0,0 @@ -# The shared numeric core (morie_core.h) uses C++17 features -# (inline variables, nested namespaces). -CXX_STD = CXX17 diff --git a/r-package/morie/src/Makevars.in b/r-package/morie/src/Makevars.in new file mode 100644 index 0000000000..ab7c2b1d87 --- /dev/null +++ b/r-package/morie/src/Makevars.in @@ -0,0 +1,7 @@ +# The shared numeric core (morie_core.h) uses C++17 features. +CXX_STD = CXX17 +# The SIU parser (siu_parser.cpp) links libcurl; the flags below are +# filled in by ./configure (via curl-config) at install time, so the +# committed Makefile carries no non-portable shell-call extension. +PKG_CPPFLAGS = @cflags@ +PKG_LIBS = @libs@ diff --git a/r-package/morie/src/Makevars.win b/r-package/morie/src/Makevars.win deleted file mode 100644 index e31e8b139d..0000000000 --- a/r-package/morie/src/Makevars.win +++ /dev/null @@ -1,3 +0,0 @@ -# The shared numeric core (morie_core.h) uses C++17 features -# (inline variables, nested namespaces). -CXX_STD = CXX17 diff --git a/r-package/morie/src/Makevars.win.in b/r-package/morie/src/Makevars.win.in new file mode 100644 index 0000000000..f2c2d6af6d --- /dev/null +++ b/r-package/morie/src/Makevars.win.in @@ -0,0 +1,6 @@ +# The shared numeric core (morie_core.h) uses C++17 features. +CXX_STD = CXX17 +# The SIU parser (siu_parser.cpp) links libcurl; the flags below are +# filled in by ./configure.win (via pkg-config) at install time. +PKG_CPPFLAGS = @cflags@ +PKG_LIBS = @libs@ diff --git a/r-package/morie/src/RcppExports.cpp b/r-package/morie/src/RcppExports.cpp index ac6ab13c8d..f10fece3cf 100644 --- a/r-package/morie/src/RcppExports.cpp +++ b/r-package/morie/src/RcppExports.cpp @@ -59,65 +59,143 @@ BEGIN_RCPP END_RCPP } // morie_hawkes_ll_exp_const_cpp -double morie_hawkes_ll_exp_const_cpp(NumericVector t, double T, double a0, double eta, double beta); -RcppExport SEXP _morie_morie_hawkes_ll_exp_const_cpp(SEXP tSEXP, SEXP TSEXP, SEXP a0SEXP, SEXP etaSEXP, SEXP betaSEXP) { +double morie_hawkes_ll_exp_const_cpp(NumericVector t, double T_horizon, double a0, double eta, double beta); +RcppExport SEXP _morie_morie_hawkes_ll_exp_const_cpp(SEXP tSEXP, SEXP T_horizonSEXP, SEXP a0SEXP, SEXP etaSEXP, SEXP betaSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type t(tSEXP); - Rcpp::traits::input_parameter< double >::type T(TSEXP); + Rcpp::traits::input_parameter< double >::type T_horizon(T_horizonSEXP); Rcpp::traits::input_parameter< double >::type a0(a0SEXP); Rcpp::traits::input_parameter< double >::type eta(etaSEXP); Rcpp::traits::input_parameter< double >::type beta(betaSEXP); - rcpp_result_gen = Rcpp::wrap(morie_hawkes_ll_exp_const_cpp(t, T, a0, eta, beta)); + rcpp_result_gen = Rcpp::wrap(morie_hawkes_ll_exp_const_cpp(t, T_horizon, a0, eta, beta)); return rcpp_result_gen; END_RCPP } // morie_hawkes_ll_weibull_const_cpp -double morie_hawkes_ll_weibull_const_cpp(NumericVector t, double T, double a0, double eta, double alpha, double lam); -RcppExport SEXP _morie_morie_hawkes_ll_weibull_const_cpp(SEXP tSEXP, SEXP TSEXP, SEXP a0SEXP, SEXP etaSEXP, SEXP alphaSEXP, SEXP lamSEXP) { +double morie_hawkes_ll_weibull_const_cpp(NumericVector t, double T_horizon, double a0, double eta, double alpha, double lam); +RcppExport SEXP _morie_morie_hawkes_ll_weibull_const_cpp(SEXP tSEXP, SEXP T_horizonSEXP, SEXP a0SEXP, SEXP etaSEXP, SEXP alphaSEXP, SEXP lamSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type t(tSEXP); - Rcpp::traits::input_parameter< double >::type T(TSEXP); + Rcpp::traits::input_parameter< double >::type T_horizon(T_horizonSEXP); Rcpp::traits::input_parameter< double >::type a0(a0SEXP); Rcpp::traits::input_parameter< double >::type eta(etaSEXP); Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); Rcpp::traits::input_parameter< double >::type lam(lamSEXP); - rcpp_result_gen = Rcpp::wrap(morie_hawkes_ll_weibull_const_cpp(t, T, a0, eta, alpha, lam)); + rcpp_result_gen = Rcpp::wrap(morie_hawkes_ll_weibull_const_cpp(t, T_horizon, a0, eta, alpha, lam)); return rcpp_result_gen; END_RCPP } // morie_hawkes_ll_lomax_const_cpp -double morie_hawkes_ll_lomax_const_cpp(NumericVector t, double T, double a0, double eta, double alpha, double c); -RcppExport SEXP _morie_morie_hawkes_ll_lomax_const_cpp(SEXP tSEXP, SEXP TSEXP, SEXP a0SEXP, SEXP etaSEXP, SEXP alphaSEXP, SEXP cSEXP) { +double morie_hawkes_ll_lomax_const_cpp(NumericVector t, double T_horizon, double a0, double eta, double alpha, double c); +RcppExport SEXP _morie_morie_hawkes_ll_lomax_const_cpp(SEXP tSEXP, SEXP T_horizonSEXP, SEXP a0SEXP, SEXP etaSEXP, SEXP alphaSEXP, SEXP cSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type t(tSEXP); - Rcpp::traits::input_parameter< double >::type T(TSEXP); + Rcpp::traits::input_parameter< double >::type T_horizon(T_horizonSEXP); Rcpp::traits::input_parameter< double >::type a0(a0SEXP); Rcpp::traits::input_parameter< double >::type eta(etaSEXP); Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); Rcpp::traits::input_parameter< double >::type c(cSEXP); - rcpp_result_gen = Rcpp::wrap(morie_hawkes_ll_lomax_const_cpp(t, T, a0, eta, alpha, c)); + rcpp_result_gen = Rcpp::wrap(morie_hawkes_ll_lomax_const_cpp(t, T_horizon, a0, eta, alpha, c)); return rcpp_result_gen; END_RCPP } // morie_hawkes_ll_gamma_const_cpp -double morie_hawkes_ll_gamma_const_cpp(NumericVector t, double T, double a0, double eta, double alpha, double beta); -RcppExport SEXP _morie_morie_hawkes_ll_gamma_const_cpp(SEXP tSEXP, SEXP TSEXP, SEXP a0SEXP, SEXP etaSEXP, SEXP alphaSEXP, SEXP betaSEXP) { +double morie_hawkes_ll_gamma_const_cpp(NumericVector t, double T_horizon, double a0, double eta, double alpha, double beta); +RcppExport SEXP _morie_morie_hawkes_ll_gamma_const_cpp(SEXP tSEXP, SEXP T_horizonSEXP, SEXP a0SEXP, SEXP etaSEXP, SEXP alphaSEXP, SEXP betaSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type t(tSEXP); - Rcpp::traits::input_parameter< double >::type T(TSEXP); + Rcpp::traits::input_parameter< double >::type T_horizon(T_horizonSEXP); Rcpp::traits::input_parameter< double >::type a0(a0SEXP); Rcpp::traits::input_parameter< double >::type eta(etaSEXP); Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); Rcpp::traits::input_parameter< double >::type beta(betaSEXP); - rcpp_result_gen = Rcpp::wrap(morie_hawkes_ll_gamma_const_cpp(t, T, a0, eta, alpha, beta)); + rcpp_result_gen = Rcpp::wrap(morie_hawkes_ll_gamma_const_cpp(t, T_horizon, a0, eta, alpha, beta)); + return rcpp_result_gen; +END_RCPP +} +// siu_http_get +std::string siu_http_get(std::string url, int timeout_s); +RcppExport SEXP _morie_siu_http_get(SEXP urlSEXP, SEXP timeout_sSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type url(urlSEXP); + Rcpp::traits::input_parameter< int >::type timeout_s(timeout_sSEXP); + rcpp_result_gen = Rcpp::wrap(siu_http_get(url, timeout_s)); + return rcpp_result_gen; +END_RCPP +} +// siu_curl_version +std::string siu_curl_version(); +RcppExport SEXP _morie_siu_curl_version() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(siu_curl_version()); + return rcpp_result_gen; +END_RCPP +} +// siu_http_get_many +Rcpp::CharacterVector siu_http_get_many(Rcpp::CharacterVector urls, int concurrency, int timeout_s, double rate_rps, int max_retries); +RcppExport SEXP _morie_siu_http_get_many(SEXP urlsSEXP, SEXP concurrencySEXP, SEXP timeout_sSEXP, SEXP rate_rpsSEXP, SEXP max_retriesSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type urls(urlsSEXP); + Rcpp::traits::input_parameter< int >::type concurrency(concurrencySEXP); + Rcpp::traits::input_parameter< int >::type timeout_s(timeout_sSEXP); + Rcpp::traits::input_parameter< double >::type rate_rps(rate_rpsSEXP); + Rcpp::traits::input_parameter< int >::type max_retries(max_retriesSEXP); + rcpp_result_gen = Rcpp::wrap(siu_http_get_many(urls, concurrency, timeout_s, rate_rps, max_retries)); + return rcpp_result_gen; +END_RCPP +} +// siu_http_get_many_with_status +Rcpp::List siu_http_get_many_with_status(Rcpp::CharacterVector urls, int concurrency, int timeout_s, double rate_rps, int max_retries); +RcppExport SEXP _morie_siu_http_get_many_with_status(SEXP urlsSEXP, SEXP concurrencySEXP, SEXP timeout_sSEXP, SEXP rate_rpsSEXP, SEXP max_retriesSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type urls(urlsSEXP); + Rcpp::traits::input_parameter< int >::type concurrency(concurrencySEXP); + Rcpp::traits::input_parameter< int >::type timeout_s(timeout_sSEXP); + Rcpp::traits::input_parameter< double >::type rate_rps(rate_rpsSEXP); + Rcpp::traits::input_parameter< int >::type max_retries(max_retriesSEXP); + rcpp_result_gen = Rcpp::wrap(siu_http_get_many_with_status(urls, concurrency, timeout_s, rate_rps, max_retries)); + return rcpp_result_gen; +END_RCPP +} +// siu_parse_report +Rcpp::CharacterVector siu_parse_report(std::string html, int drid, std::string url); +RcppExport SEXP _morie_siu_parse_report(SEXP htmlSEXP, SEXP dridSEXP, SEXP urlSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type html(htmlSEXP); + Rcpp::traits::input_parameter< int >::type drid(dridSEXP); + Rcpp::traits::input_parameter< std::string >::type url(urlSEXP); + rcpp_result_gen = Rcpp::wrap(siu_parse_report(html, drid, url)); + return rcpp_result_gen; +END_RCPP +} +// siu_parse_news +Rcpp::CharacterVector siu_parse_news(std::string html, int nrid, std::string url); +RcppExport SEXP _morie_siu_parse_news(SEXP htmlSEXP, SEXP nridSEXP, SEXP urlSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type html(htmlSEXP); + Rcpp::traits::input_parameter< int >::type nrid(nridSEXP); + Rcpp::traits::input_parameter< std::string >::type url(urlSEXP); + rcpp_result_gen = Rcpp::wrap(siu_parse_news(html, nrid, url)); return rcpp_result_gen; END_RCPP } @@ -131,6 +209,12 @@ static const R_CallMethodDef CallEntries[] = { {"_morie_morie_hawkes_ll_weibull_const_cpp", (DL_FUNC) &_morie_morie_hawkes_ll_weibull_const_cpp, 6}, {"_morie_morie_hawkes_ll_lomax_const_cpp", (DL_FUNC) &_morie_morie_hawkes_ll_lomax_const_cpp, 6}, {"_morie_morie_hawkes_ll_gamma_const_cpp", (DL_FUNC) &_morie_morie_hawkes_ll_gamma_const_cpp, 6}, + {"_morie_siu_http_get", (DL_FUNC) &_morie_siu_http_get, 2}, + {"_morie_siu_curl_version", (DL_FUNC) &_morie_siu_curl_version, 0}, + {"_morie_siu_http_get_many", (DL_FUNC) &_morie_siu_http_get_many, 5}, + {"_morie_siu_http_get_many_with_status", (DL_FUNC) &_morie_siu_http_get_many_with_status, 5}, + {"_morie_siu_parse_report", (DL_FUNC) &_morie_siu_parse_report, 3}, + {"_morie_siu_parse_news", (DL_FUNC) &_morie_siu_parse_news, 3}, {NULL, NULL, 0} }; diff --git a/r-package/morie/src/morie_fast.cpp b/r-package/morie/src/morie_fast.cpp index 120c727b64..5e0ebeb1ad 100644 --- a/r-package/morie/src/morie_fast.cpp +++ b/r-package/morie/src/morie_fast.cpp @@ -64,31 +64,31 @@ double morie_cor_pearson_cpp(NumericVector x, NumericVector y) { // returns 1e12 for an infeasible parameter vector. // [[Rcpp::export]] -double morie_hawkes_ll_exp_const_cpp(NumericVector t, double T, double a0, +double morie_hawkes_ll_exp_const_cpp(NumericVector t, double T_horizon, double a0, double eta, double beta) { - return morie::core::hawkes_ll_exp_const(t.begin(), len(t), T, a0, eta, + return morie::core::hawkes_ll_exp_const(t.begin(), len(t), T_horizon, a0, eta, beta); } // [[Rcpp::export]] -double morie_hawkes_ll_weibull_const_cpp(NumericVector t, double T, double a0, +double morie_hawkes_ll_weibull_const_cpp(NumericVector t, double T_horizon, double a0, double eta, double alpha, double lam) { - return morie::core::hawkes_ll_weibull_const_trunc(t.begin(), len(t), T, + return morie::core::hawkes_ll_weibull_const_trunc(t.begin(), len(t), T_horizon, a0, eta, alpha, lam); } // [[Rcpp::export]] -double morie_hawkes_ll_lomax_const_cpp(NumericVector t, double T, double a0, +double morie_hawkes_ll_lomax_const_cpp(NumericVector t, double T_horizon, double a0, double eta, double alpha, double c) { - return morie::core::hawkes_ll_lomax_const(t.begin(), len(t), T, a0, eta, + return morie::core::hawkes_ll_lomax_const(t.begin(), len(t), T_horizon, a0, eta, alpha, c); } // [[Rcpp::export]] -double morie_hawkes_ll_gamma_const_cpp(NumericVector t, double T, double a0, +double morie_hawkes_ll_gamma_const_cpp(NumericVector t, double T_horizon, double a0, double eta, double alpha, double beta) { - return morie::core::hawkes_ll_gamma_const_trunc(t.begin(), len(t), T, a0, + return morie::core::hawkes_ll_gamma_const_trunc(t.begin(), len(t), T_horizon, a0, eta, alpha, beta); } diff --git a/r-package/morie/src/siu_parser.cpp b/r-package/morie/src/siu_parser.cpp new file mode 100644 index 0000000000..5c77f1cd82 --- /dev/null +++ b/r-package/morie/src/siu_parser.cpp @@ -0,0 +1,1186 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// siu_parser.cpp -- C/C++ parser for the Ontario Special Investigations +// Unit (SIU) corpus: director's reports and news releases. +// +// HTTP(S) transport is libcurl (linked via src/Makevars). This file is +// the foundation; the concurrent fetcher and the 64-field HTML parser +// are layered on top in later commits. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// parser_version stamped into every emitted row. +#define MORIE_SIU_PARSER_VERSION "0.2.0" + +namespace { + +// One-time libcurl global initialisation (libcurl requires this before +// any handle is created when the program is multi-threaded). +struct CurlGlobal { + CurlGlobal() { curl_global_init(CURL_GLOBAL_DEFAULT); } + // # nocov start + // -- the destructor runs only at process teardown (static-storage + // -- object); gcov does not reliably attribute exit-time execution. + ~CurlGlobal() { curl_global_cleanup(); } + // # nocov end +}; +const CurlGlobal kCurlGlobal; + +const char* kUserAgent = + "morie/0.9.5 (+https://github.com/hadesllm/morie)"; + +// libcurl write callback: append received bytes to a std::string. +size_t write_cb(char* ptr, size_t size, size_t nmemb, void* userdata) { + std::string* buf = static_cast(userdata); + const size_t n = size * nmemb; + buf->append(ptr, n); + return n; +} + +} // namespace + +//' Fetch a single URL over HTTP(S) via libcurl +//' +//' Internal building block of the SIU parser. Returns the response +//' body, or an empty string on any transport-level failure. +//' +//' @param url URL to fetch. +//' @param timeout_s Request timeout in seconds. +//' @return The response body as a length-1 character vector. +//' @keywords internal +// [[Rcpp::export(.siu_http_get)]] +std::string siu_http_get(std::string url, int timeout_s = 60) { + CURL* h = curl_easy_init(); + if (h == nullptr) return std::string(); + std::string buf; + curl_easy_setopt(h, CURLOPT_URL, url.c_str()); + curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, write_cb); + curl_easy_setopt(h, CURLOPT_WRITEDATA, &buf); + curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(h, CURLOPT_TIMEOUT, static_cast(timeout_s)); + curl_easy_setopt(h, CURLOPT_CONNECTTIMEOUT, 30L); + curl_easy_setopt(h, CURLOPT_USERAGENT, kUserAgent); + curl_easy_setopt(h, CURLOPT_ACCEPT_ENCODING, ""); // all supported + curl_easy_setopt(h, CURLOPT_NOSIGNAL, 1L); + const CURLcode rc = curl_easy_perform(h); + curl_easy_cleanup(h); + if (rc != CURLE_OK) return std::string(); + return buf; +} + +//' libcurl version string morie was built against +//' @return A length-1 character vector. +//' @keywords internal +// [[Rcpp::export(.siu_curl_version)]] +std::string siu_curl_version() { + return std::string(curl_version()); +} + +namespace { + +// One in-flight request. The body buffer accumulates over retries +// (cleared on each new attempt). `attempts` counts how many times the +// URL has been dispatched (1 on first send; retries bump it). +struct Req { + int idx; + std::string url; + std::string body; + long http_code; // last observed HTTP status (0 on transport failure) + int attempts; + long earliest_ns; // wall-clock ns before which the next attempt may start +}; + +// Configure a fresh easy handle for one SIU page fetch. +void setup_handle(CURL* e, const char* url, Req* r, long timeout_s) { + curl_easy_setopt(e, CURLOPT_URL, url); + curl_easy_setopt(e, CURLOPT_WRITEFUNCTION, write_cb); + curl_easy_setopt(e, CURLOPT_WRITEDATA, &r->body); + curl_easy_setopt(e, CURLOPT_PRIVATE, r); + curl_easy_setopt(e, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(e, CURLOPT_TIMEOUT, timeout_s); + curl_easy_setopt(e, CURLOPT_CONNECTTIMEOUT, 30L); + curl_easy_setopt(e, CURLOPT_USERAGENT, kUserAgent); + curl_easy_setopt(e, CURLOPT_ACCEPT_ENCODING, ""); + curl_easy_setopt(e, CURLOPT_NOSIGNAL, 1L); +} + +// Current monotonic clock in nanoseconds. +long now_ns() { + using namespace std::chrono; + return duration_cast( + steady_clock::now().time_since_epoch()).count(); +} + +// Core throttled multi-fetch. Returns the per-URL results in `reqs` +// (parallel to `urls`). All work is bounded by `rate_rps` request +// starts per second across the whole pool. Retries 429/503 up to +// `max_retries` with exponential backoff (250ms * 2^attempt). +void fetch_many_throttled(const std::vector& urls, + int concurrency, long timeout_s, + double rate_rps, int max_retries, + std::vector* reqs_out) { + const int n = static_cast(urls.size()); + std::vector& reqs = *reqs_out; + reqs.assign(n, nullptr); + for (int i = 0; i < n; ++i) { + reqs[i] = new Req{i, urls[i], std::string(), 0L, 0, 0L}; + } + if (n == 0) return; + if (concurrency < 1) concurrency = 1; + if (concurrency > n) concurrency = n; + if (rate_rps <= 0.0) rate_rps = 1e9; // effectively unthrottled + if (rate_rps > 1000.0) rate_rps = 1000.0; + if (max_retries < 0) max_retries = 0; + + // Minimum gap between request starts (in ns). + const long gap_ns = static_cast(1.0e9 / rate_rps); + long next_allowed = now_ns(); + + CURLM* multi = curl_multi_init(); + + // Queue of req indices ready to dispatch (FIFO for first attempt, + // then 429/503 retries get appended back at the tail with an + // earliest_ns gate). When the queue is empty and nothing is in + // flight, we're done. + std::vector queue; + queue.reserve(n); + for (int i = 0; i < n; ++i) queue.push_back(i); + + std::map in_flight; + + auto dispatch_one = [&](int idx) { + Req* r = reqs[idx]; + r->body.clear(); + r->attempts += 1; + CURL* e = curl_easy_init(); + setup_handle(e, r->url.c_str(), r, timeout_s); + curl_multi_add_handle(multi, e); + in_flight[e] = r; + }; + + while (!queue.empty() || !in_flight.empty()) { + // Top up the in-flight set, respecting both concurrency and rate. + while (!queue.empty() && static_cast(in_flight.size()) < concurrency) { + const int idx = queue.front(); + Req* r = reqs[idx]; + const long t = now_ns(); + // Per-request backoff gate (set when this req was a 429/503). + if (r->earliest_ns > t) break; + // Global rate-limit gate. + if (next_allowed > t) break; + queue.erase(queue.begin()); + dispatch_one(idx); + next_allowed = std::max(t, next_allowed) + gap_ns; + } + + int still_running = 0; + curl_multi_perform(multi, &still_running); + int numfds = 0; + // Poll up to 250ms so the rate-limit gate has a chance to open. + curl_multi_poll(multi, nullptr, 0, 250, &numfds); + + CURLMsg* msg = nullptr; + int msgs_left = 0; + while ((msg = curl_multi_info_read(multi, &msgs_left)) != nullptr) { + if (msg->msg != CURLMSG_DONE) continue; + CURL* e = msg->easy_handle; + Req* r = in_flight[e]; + long code = 0; + curl_easy_getinfo(e, CURLINFO_RESPONSE_CODE, &code); + r->http_code = code; + const CURLcode rc = msg->data.result; + curl_multi_remove_handle(multi, e); + curl_easy_cleanup(e); + in_flight.erase(e); + + const bool retryable = (rc != CURLE_OK) || + code == 429 || code == 502 || code == 503 || code == 504; + if (retryable && r->attempts <= max_retries) { + // Exponential backoff: 250ms, 500ms, 1000ms, 2000ms, ... + const long backoff_ns = + 250L * 1000000L * (1L << std::min(r->attempts - 1, 6)); + r->earliest_ns = now_ns() + backoff_ns; + queue.push_back(r->idx); // re-queue at tail + } + // Otherwise: this slot is final. r->body and r->http_code keep + // whatever the last attempt produced. + } + Rcpp::checkUserInterrupt(); + } + + curl_multi_cleanup(multi); +} + +} // namespace + +//' Fetch many URLs concurrently via libcurl, with rate-limiting + retry +//' +//' Drives up to \code{concurrency} simultaneous transfers, but with a +//' global token-bucket limit of \code{rate_rps} request starts per +//' second across the whole pool. HTTP 429/502/503/504 and transport +//' errors are retried up to \code{max_retries} times with exponential +//' backoff (250ms * 2^attempt). Final failures yield an empty string +//' at their slot. +//' +//' Throttling is the safe default for SIU and similar small-gov +//' endpoints: hammering them with 16-24 concurrent requests triggers +//' WAF/Cloudflare-style bot-protection that returns short +//' interstitial pages, which look like data but aren't. +//' +//' @param urls Character vector of URLs. +//' @param concurrency Maximum simultaneous transfers. +//' @param timeout_s Per-request timeout in seconds. +//' @param rate_rps Maximum request starts per second across the pool. +//' Default \code{4.0} is a polite scrape rate that stays well under +//' any common WAF threshold. Set very large (e.g. \code{1e9}) to +//' effectively disable throttling. +//' @param max_retries Maximum retry attempts per URL on 429/5xx / +//' transport failure. +//' @return A character vector of response bodies, parallel to \code{urls}. +//' @keywords internal +// [[Rcpp::export(.siu_http_get_many)]] +Rcpp::CharacterVector siu_http_get_many(Rcpp::CharacterVector urls, + int concurrency = 4, + int timeout_s = 60, + double rate_rps = 4.0, + int max_retries = 3) { + const int n = urls.size(); + Rcpp::CharacterVector out(n); + for (int i = 0; i < n; ++i) out[i] = ""; + if (n == 0) return out; + + std::vector u(n); + for (int i = 0; i < n; ++i) u[i] = std::string(urls[i]); + + std::vector reqs; + fetch_many_throttled(u, concurrency, static_cast(timeout_s), + rate_rps, max_retries, &reqs); + + for (Req* r : reqs) { + out[r->idx] = r->body; + delete r; + } + return out; +} + +//' Fetch many URLs and return body + http_code + attempts +//' +//' Same throttle/retry behaviour as \code{.siu_http_get_many} but the +//' return value preserves the HTTP status code and attempt count for +//' each URL, so callers can distinguish a healthy 200 with a small +//' body from a 429/503/short interstitial. Used by the DRID manifest +//' builder (\code{morie_siu_refresh_manifest}). +//' +//' @inheritParams siu_http_get_many +//' @return A list with three parallel slots: \code{body} (character), +//' \code{http_code} (integer), \code{attempts} (integer). +//' @keywords internal +// [[Rcpp::export(.siu_http_get_many_with_status)]] +Rcpp::List siu_http_get_many_with_status(Rcpp::CharacterVector urls, + int concurrency = 4, + int timeout_s = 60, + double rate_rps = 4.0, + int max_retries = 3) { + const int n = urls.size(); + Rcpp::CharacterVector body(n); + Rcpp::IntegerVector code(n); + Rcpp::IntegerVector attempts(n); + for (int i = 0; i < n; ++i) { + body[i] = ""; + code[i] = NA_INTEGER; + attempts[i] = 0; + } + if (n == 0) { + return Rcpp::List::create(Rcpp::Named("body") = body, + Rcpp::Named("http_code") = code, + Rcpp::Named("attempts") = attempts); + } + + std::vector u(n); + for (int i = 0; i < n; ++i) u[i] = std::string(urls[i]); + + std::vector reqs; + fetch_many_throttled(u, concurrency, static_cast(timeout_s), + rate_rps, max_retries, &reqs); + + for (Req* r : reqs) { + body[r->idx] = r->body; + code[r->idx] = static_cast(r->http_code); + attempts[r->idx] = r->attempts; + delete r; + } + return Rcpp::List::create(Rcpp::Named("body") = body, + Rcpp::Named("http_code") = code, + Rcpp::Named("attempts") = attempts); +} + +// =========================================================================== +// HTML parsing -- SIU director's-report pages -> the 64-column schema. +// =========================================================================== + +namespace { + +// Canonical 64-column order of the SIU dataset. +const std::vector kSiuCols = { + "case_number", "drid", "nrid", "source_url_report", "source_url_news", + "scraped_at_utc", "parser_version", "date_of_incident_iso", + "date_of_incident_raw", "time_of_incident_raw", "date_of_injury_iso", + "date_of_injury_raw", "incident_to_injury_raw", "date_siu_notified_iso", + "date_siu_notified_raw", "time_of_notification_raw", "notifying_party", + "notifying_party_other_text", "date_of_director_decision_iso", + "date_of_director_decision_raw", "time_of_director_decision_raw", + "siu_investigators", "siu_forensics_investigators", "police_service", + "number_of_officers_involved", "location_of_call", + "type_of_building_or_scene", "reason_for_interaction", "injuries_sustained", + "injuries_other_text", "specific_injuries", "location_of_treatment", + "number_of_affected_persons", "sex_gender_affected", "age_affected", + "affected_interviewed", "date_of_affected_interview_iso", + "date_of_affected_interview_raw", "number_of_civilian_witnesses", + "date_of_witness_interview_raw", "number_of_subject_officials", + "subject_official_interviewed_or_notes", "date_of_subject_interview_raw", + "number_of_witness_officials", "date_of_witness_official_interview_raw", + "evidence_types", "evidence_other_text", "evidence_features", + "narrative_summary", "relevant_legislation", "legislation_other_text", + "weapons_or_force_used", "weapons_other_text", "charges_recommended", + "directors_decision_reasonable", "supplemental_materials", + "news_links_extra", "mental_health_or_race_indications", "_language", + "news_release_title", "news_release_date_iso", "news_release_date_raw", + "news_release_summary", "directors_name" +}; + +// Decode the HTML entities that occur in SIU pages. +std::string decode_entities(std::string s) { + struct E { const char* k; const char* v; }; + static const E named[] = { + {"&", "&"}, {"<", "<"}, {">", ">"}, {""", "\""}, + {"'", "'"}, {" ", " "}, {"’", "'"}, {"‘", "'"}, + {"“", "\""}, {"”", "\""}, {"–", "-"}, {"—", "-"}, + {"…", "..."}, {"é", "\xC3\xA9"}, {"è", "\xC3\xA8"}, + {"à", "\xC3\xA0"}, {"ç", "\xC3\xA7"}, {"â", "\xC3\xA2"}, + {"ê", "\xC3\xAA"}, {"î", "\xC3\xAE"}, {"ô", "\xC3\xB4"}, + {"û", "\xC3\xBB"}, {"ï", "\xC3\xAF"}, {"ë", "\xC3\xAB"}, + {"ù", "\xC3\xB9"}, {"ì", "\xC3\xAC"}, {"É", "\xC3\x89"}, + {"œ", "\xC5\x93"}, {"«", "\xC2\xAB"}, {"»", "\xC2\xBB"}, + {"'", "'"} + }; + for (const E& e : named) { + std::string::size_type p = 0; + while ((p = s.find(e.k, p)) != std::string::npos) { + s.replace(p, std::string(e.k).size(), e.v); + p += std::string(e.v).size(); + } + } + // Numeric entities &#NN; / &#xNN; -- decode the ASCII range; map a few + // common Unicode punctuation points to their ASCII equivalents. + static const std::regex num_re("&#(x?[0-9A-Fa-f]+);"); + std::string out; + out.reserve(s.size()); + auto begin = std::sregex_iterator(s.begin(), s.end(), num_re); + auto end = std::sregex_iterator(); + std::string::size_type last = 0; + for (auto it = begin; it != end; ++it) { + out.append(s, last, it->position() - last); + std::string tok = (*it)[1].str(); + long cp = (tok[0] == 'x') ? std::strtol(tok.c_str() + 1, nullptr, 16) + : std::strtol(tok.c_str(), nullptr, 10); + if (cp == 8217 || cp == 8216 || cp == 8242) out.push_back('\''); + else if (cp == 8220 || cp == 8221) out.push_back('"'); + else if (cp == 8211 || cp == 8212) out.push_back('-'); + else if (cp >= 32 && cp < 127) out.push_back(static_cast(cp)); + else out.push_back(' '); + last = it->position() + it->length(); + } + out.append(s, last, std::string::npos); + return out; +} + +// Collapse runs of whitespace to single spaces and trim. +std::string squeeze(const std::string& s) { + std::string out; + out.reserve(s.size()); + bool sp = false; + for (char c : s) { + const bool ws = (c == ' ' || c == '\t' || c == '\n' || c == '\r' || + c == '\f' || c == '\v'); + if (ws) { sp = true; continue; } + if (sp && !out.empty()) out.push_back(' '); + sp = false; + out.push_back(c); + } + return out; +} + +// Case-insensitive substring search starting at pos. Returns npos if +// not found. Used by the linear HTML stripper below. +std::string::size_type ifind(const std::string& s, const char* needle, + std::string::size_type pos) { + const std::size_t n = std::strlen(needle); + if (n == 0) return pos; + for (std::string::size_type i = pos; i + n <= s.size(); ++i) { + std::string::size_type k = 0; + for (; k < n; ++k) { + const char a = s[i + k]; + const char b = needle[k]; + const char la = (a >= 'A' && a <= 'Z') ? (a + ('a' - 'A')) : a; + const char lb = (b >= 'A' && b <= 'Z') ? (b + ('a' - 'A')) : b; + if (la != lb) break; + } + if (k == n) return i; + } + return std::string::npos; +} + +// Strip all HTML markup from a fragment and return decoded plain text. +// +// Linear state-machine implementation (one pass, no recursion). The +// previous std::regex_replace("]*>.*?") form blew +// the C stack via catastrophic backtracking on at least one drid in +// the 1..6000 sweep, killing the whole manifest job. We can't catch a +// stack overflow in C++, so the only safe fix is to remove the +// recursive matcher entirely. +// +// Inputs are also defensively bounded to 4 MB before scanning. SIU +// report pages run ~50-100 kB; anything larger is malformed or a +// runaway server response, and parsing it accomplishes nothing. +std::string html_to_text(std::string h) { + static const std::size_t kMaxBytes = 4u * 1024u * 1024u; // 4 MB + if (h.size() > kMaxBytes) h.resize(kMaxBytes); + + std::string out; + out.reserve(h.size()); + std::string::size_type i = 0; + const std::string::size_type n = h.size(); + + while (i < n) { + if (h[i] != '<') { out.push_back(h[i]); ++i; continue; } + + // Look for ]*> or ]*>: must skip past the + // matching close tag, replacing the whole span with a space. + const bool is_script = (i + 7 <= n) && + (ifind(h.substr(i, 7), "" : ""; + std::string::size_type end = ifind(h, close, i); + if (end == std::string::npos) { + // Unclosed -- treat rest of input as junk and bail. + out.push_back(' '); + break; + } + end += std::strlen(close); + out.push_back(' '); + i = end; + continue; + } + + // Any other tag: skip to '>' and emit a space. + std::string::size_type gt = h.find('>', i + 1); + if (gt == std::string::npos) { + // Unterminated tag -- treat rest as text. + out.append(h, i, std::string::npos); + break; + } + out.push_back(' '); + i = gt + 1; + } + + return squeeze(decode_entities(out)); +} + +// Lowercase-fold an ASCII string AND normalise common Unicode +// punctuation to ASCII equivalents so substring-matching against a +// C-string keyword works. SIU h2 headings use typographic +// apostrophes (U+2019 'Director's report') and curly quotes; without +// this normalisation, a keyword like "director's report" with an +// ASCII apostrophe would silently fail to match. Cheap, no allocs. +std::string lower_ascii(std::string s) { + // U+2019 = E2 80 99, U+2018 = E2 80 98, U+201C = E2 80 9C, + // U+201D = E2 80 9D, U+2013 = E2 80 93 (en-dash), + // U+2014 = E2 80 94 (em-dash). Replace each 3-byte UTF-8 sequence + // with a single ASCII char, in-place by overwrite + erase. + std::string out; + out.reserve(s.size()); + for (std::string::size_type i = 0; i < s.size(); ) { + const unsigned char c = static_cast(s[i]); + if (c == 0xE2 && i + 2 < s.size()) { + const unsigned char c1 = static_cast(s[i + 1]); + const unsigned char c2 = static_cast(s[i + 2]); + if (c1 == 0x80) { + char repl = 0; + if (c2 == 0x98 || c2 == 0x99) repl = '\''; // curly apostrophes + else if (c2 == 0x9C || c2 == 0x9D) repl = '"'; // curly quotes + else if (c2 == 0x93 || c2 == 0x94) repl = '-'; // en/em dashes + if (repl != 0) { out.push_back(repl); i += 3; continue; } + } + } + out.push_back((c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : s[i]); + ++i; + } + return out; +} + +// Find the bounds of the section whose

heading text contains +// `title_keyword` (case-insensitive substring). The SIU site has +// two template families that flip the section_5/section_6 ordering +// between "Evidence" and "Incident Narrative", so looking up by +// title is robust where hard-coded numbers are not. +// +// Returns {start, end} where start is just past the `

...

` +// closing tag and end is at the next

+section_bounds_by_title(const std::string& html, + const char* title_keyword) { + const std::string kw = lower_ascii(title_keyword); + std::string::size_type pos = 0; + while (true) { + const std::string::size_type h2 = html.find("', h2); + if (gt == std::string::npos) break; + const std::string::size_type close = html.find("

", gt); + if (close == std::string::npos) break; + const std::string heading_text = + lower_ascii(html.substr(gt + 1, close - gt - 1)); + if (heading_text.find(kw) != std::string::npos) { + const std::string::size_type body = close + 5; // past

+ std::string::size_type end = html.size(); + for (const char* terminator : {" heading matches `title_keyword`. +// Falls back to an empty string when no matching section exists -- +// e.g. some short reports omit the legislation section entirely. +std::string section_text_by_title(const std::string& html, + const char* title_keyword) { + auto bounds = section_bounds_by_title(html, title_keyword); + if (bounds.first == std::string::npos) return std::string(); + return html_to_text(html.substr(bounds.first, + bounds.second - bounds.first)); +} + +// Plain text of the report section whose

carries id="section_". +// Stops at the NEXT

(the next section heading) OR at the first +// page-chrome boundary ( follows it) silently includes the site footer, which +// leaks left-nav phrases like "First Nations, Inuit and Metis +// Liaison Program" into every report's narrative_summary, +// supplemental_materials, and mental_health_or_race_indications. +std::string section_text(const std::string& html, int n) { + const std::string anchor = "id=\"section_" + std::to_string(n) + "\""; + std::string::size_type a = html.find(anchor); + if (a == std::string::npos) return std::string(); + std::string::size_type body = html.find('>', a); + if (body == std::string::npos) return std::string(); + ++body; + std::string::size_type b = html.size(); + for (const char* terminator : {" 1) return m[1].str(); + } catch (...) {} + return std::string(); +} + +// Highest N among "