From b7b673302207cf8241200173a2bf263a8a2b4bf9 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 1 Jul 2026 19:06:55 +0200 Subject: [PATCH 1/5] pdf: add PdfTextMode config option to HtmlConfig Introduces a `PdfTextMode` enum with two values: - `dual_layer`: visual (PUA glyphs, paint order) + transparent Unicode selection/search layer. Default. - `single_layer`: single combined layer with frequency-based Unicode mapping, similar to pdf2htmlEX. The active mode is controlled by `HtmlConfig::pdf_text_mode`. Co-Authored-By: Claude Sonnet 4.6 Claude-Session: https://claude.ai/code/session_01Mq2d2eFjjCL8cHpU9pHugq --- src/odr/html.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/odr/html.hpp b/src/odr/html.hpp index 9dd92bd4..82151d63 100644 --- a/src/odr/html.hpp +++ b/src/odr/html.hpp @@ -65,6 +65,21 @@ enum class HtmlTableGridlines { hard, }; +/// @brief PDF text rendering mode. +/// +/// Selects how text is emitted in PDF→HTML output. +/// +/// - `dual_layer`: A visual layer (paint order, embedded PUA glyphs) and a +/// separate transparent selection/search layer (reading order, real Unicode). +/// Similar to pdf.js. No JavaScript required. +/// - `single_layer`: A single combined layer where every glyph is mapped to +/// Unicode via frequency analysis. Similar to pdf2htmlEX. No JavaScript +/// required. +enum class PdfTextMode { + dual_layer, + single_layer, +}; + /// @brief HTML configuration. 
struct HtmlConfig { // document output file names @@ -106,6 +121,9 @@ struct HtmlConfig { std::string background_image_format{"png"}; double background_image_dpi{144.0}; + // PDF text mode + PdfTextMode pdf_text_mode{PdfTextMode::dual_layer}; + // drm options bool no_drm{false}; From 74f51ee76fdf27deb26fe7f5c6eb26dc646eeb56 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Wed, 1 Jul 2026 19:07:29 +0200 Subject: [PATCH 2/5] pdf: dual-layer and single-layer text rendering (#577 #578) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the single-glyph-per-absolute-span approach with two modes, both using line blocks (position:absolute on the line div, margin-left on inline run spans) instead of per-glyph absolute positioning. Dual-layer mode (default, PdfTextMode::dual_layer): - Visual layer (
): paint-order glyph rendering. Fonts re-encoded to PUA. Invisible text omitted. - Selection layer (
): transparent real-Unicode text. Runs grouped into line blocks by baseline; space detection inserts gap spans. Each run span is display:inline-block with CSS justify (text-align:justify; text-align-last:justify; text-justify:inter-character) so characters fill the PDF advance without JavaScript. - Similar approach to pdf.js. Single-layer mode (PdfTextMode::single_layer): - One combined layer per page in paint order. - Pre-pass frequency analysis: counts (uchar, glyph) co-occurrences per font, then picks the most-frequent glyph as the cmap entry — so the common case wins, not first-come-first-served. - Clean runs (all uchar→glyph pairs match the winner) render the real Unicode directly in the embedded font — natively selectable. - Unclean runs paint glyphs via ::before{content:attr(data-g)} with a zero-width display:inline-block overlay span for selectability. - PUA-only chars (no Unicode mapping) remain visible but unselectable. - Similar approach to pdf2htmlEX. Co-Authored-By: Claude Sonnet 4.6 Claude-Session: https://claude.ai/code/session_01Mq2d2eFjjCL8cHpU9pHugq --- src/odr/internal/html/pdf_file.cpp | 1443 ++++++++++++++++++++-------- 1 file changed, 1039 insertions(+), 404 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 07a250d1..39f9cf0a 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -472,13 +472,12 @@ class PatternRegistry : public DefsRegistry { }; /// Deduplicates CSS declarations into atomic, single-property classes. PDF text -/// emits one absolutely-positioned span per glyph run, and the same font sizes, -/// offsets and spacings recur across the (potentially millions of) spans. -/// Writing each declaration inline bloats the document — the Bluetooth Core -/// spec reference output crossed GitHub's 100 MB file limit. 
Instead, every -/// distinct declaration is registered once here, named `<prefix><n>` in +/// emits one absolutely-positioned line block per detected line, and the same +/// font sizes, offsets and spacings recur across the (potentially millions of) +/// elements. Writing each declaration inline bloats the document. Instead, +/// every distinct declaration is registered once here, named `<prefix><n>` in /// first-seen order (e.g. `f1`, `f2` for font sizes, `t1` for a top offset), -/// emitted once in <style>, and referenced by class on each span. This is +/// emitted once in <style>, and referenced by class on each element. This is /// representation-only: the computed style of every element is unchanged. class AtomicStyles { public: @@ -530,7 +529,6 @@ class HtmlServiceImpl final : public HtmlService { if (path == "document.html") { return true; } - return false; } @@ -538,7 +536,6 @@ class HtmlServiceImpl final : public HtmlService { if (path == "document.html") { return "text/html"; } - throw FileNotFound("Unknown path: " + path); } @@ -548,7 +545,6 @@ class HtmlServiceImpl final : public HtmlService { write_document(writer); return; } - throw FileNotFound("Unknown path: " + path); } @@ -557,104 +553,113 @@ class HtmlServiceImpl final : public HtmlService { if (path == "document.html") { return write_document(out); } - throw FileNotFound("Unknown path: " + path); } - // One emitted span. The styling is fully resolved into class tokens during - // the first pass; only the (already escaped) text and class list survive to - // the writing pass. A text run with an embedded font emits the dual layer as - // a transparent selectable span carrying the real Unicode with the visible - // glyph layer (PUA code points in the `@font-face` font) nested inside it: - // the child is absolutely positioned at the run origin and inherits the - // font size, spacing, and transform from the parent, so the placement - // classes live only on the parent. 
`glyph_classes` is empty when there is no - // nested layer (the legacy fallback path and display-only runs). - struct SpanOut { + HtmlResources write_document(HtmlWriter &out) const { + if (config().pdf_text_mode == PdfTextMode::single_layer) { + return write_document_single_layer(out); + } + return write_document_dual_layer(out); + } + + // ========================================================================= + // DUAL-LAYER MODE + // ========================================================================= + // + // Two separate layers per page: + // + // Visual layer (`