diff --git a/src/odr/html.hpp b/src/odr/html.hpp index 9dd92bd4..82151d63 100644 --- a/src/odr/html.hpp +++ b/src/odr/html.hpp @@ -65,6 +65,21 @@ enum class HtmlTableGridlines { hard, }; +/// @brief PDF text rendering mode. +/// +/// Selects how text is emitted in PDF→HTML output. +/// +/// - `dual_layer`: A visual layer (paint order, embedded PUA glyphs) and a +/// separate transparent selection/search layer (reading order, real Unicode). +/// Similar to pdf.js. No JavaScript required. +/// - `single_layer`: A single combined layer where every glyph is mapped to +/// Unicode via frequency analysis. Similar to pdf2htmlEX. No JavaScript +/// required. +enum class PdfTextMode { + dual_layer, + single_layer, +}; + /// @brief HTML configuration. struct HtmlConfig { // document output file names @@ -106,6 +121,9 @@ struct HtmlConfig { std::string background_image_format{"png"}; double background_image_dpi{144.0}; + // PDF text mode + PdfTextMode pdf_text_mode{PdfTextMode::dual_layer}; + // drm options bool no_drm{false}; diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 07a250d1..372ae3b6 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -472,13 +472,12 @@ class PatternRegistry : public DefsRegistry { }; /// Deduplicates CSS declarations into atomic, single-property classes. PDF text -/// emits one absolutely-positioned span per glyph run, and the same font sizes, -/// offsets and spacings recur across the (potentially millions of) spans. -/// Writing each declaration inline bloats the document — the Bluetooth Core -/// spec reference output crossed GitHub's 100 MB file limit. Instead, every -/// distinct declaration is registered once here, named `` in +/// emits one absolutely-positioned line block per detected line, and the same +/// font sizes, offsets and spacings recur across the (potentially millions of) +/// elements. Writing each declaration inline bloats the document. Instead, +/// every distinct declaration is registered once here, named `` in /// first-seen order (e.g. `f1`, `f2` for font sizes, `t1` for a top offset), -/// emitted once in , and referenced by class on each span. This is +/// emitted once in , and referenced by class on each element. This is /// representation-only: the computed style of every element is unchanged. class AtomicStyles { public: @@ -530,7 +529,6 @@ class HtmlServiceImpl final : public HtmlService { if (path == "document.html") { return true; } - return false; } @@ -538,7 +536,6 @@ class HtmlServiceImpl final : public HtmlService { if (path == "document.html") { return "text/html"; } - throw FileNotFound("Unknown path: " + path); } @@ -548,7 +545,6 @@ class HtmlServiceImpl final : public HtmlService { write_document(writer); return; } - throw FileNotFound("Unknown path: " + path); } @@ -557,104 +553,113 @@ class HtmlServiceImpl final : public HtmlService { if (path == "document.html") { return write_document(out); } - throw FileNotFound("Unknown path: " + path); } - // One emitted span. The styling is fully resolved into class tokens during - // the first pass; only the (already escaped) text and class list survive to - // the writing pass. A text run with an embedded font emits the dual layer as - // a transparent selectable span carrying the real Unicode with the visible - // glyph layer (PUA code points in the `@font-face` font) nested inside it: - // the child is absolutely positioned at the run origin and inherits the - // font size, spacing, and transform from the parent, so the placement - // classes live only on the parent. `glyph_classes` is empty when there is no - // nested layer (the legacy fallback path and display-only runs). - struct SpanOut { + HtmlResources write_document(HtmlWriter &out) const { + if (config().pdf_text_mode == PdfTextMode::single_layer) { + return write_document_single_layer(out); + } + return write_document_dual_layer(out); + } + + // ========================================================================= + // DUAL-LAYER MODE + // ========================================================================= + // + // Two separate layers per page: + // + // Visual layer (`