diff --git a/src/odr/html.hpp b/src/odr/html.hpp
index 9dd92bd4..82151d63 100644
--- a/src/odr/html.hpp
+++ b/src/odr/html.hpp
@@ -65,6 +65,21 @@ enum class HtmlTableGridlines {
hard,
};
+/// @brief PDF text rendering mode.
+///
+/// Selects how text is emitted in PDF→HTML output.
+///
+/// - `dual_layer`: A visual layer (paint order, embedded PUA glyphs) and a
+/// separate transparent selection/search layer (reading order, real Unicode).
+/// Similar to pdf.js. No JavaScript required.
+/// - `single_layer`: A single combined layer where every glyph is mapped to
+/// Unicode via frequency analysis. Similar to pdf2htmlEX. No JavaScript
+/// required.
+enum class PdfTextMode {
+ dual_layer, ///< Separate visual and selection layers (the HtmlConfig default).
+ single_layer, ///< One combined layer with every glyph mapped to Unicode.
+};
+
/// @brief HTML configuration.
struct HtmlConfig {
// document output file names
@@ -106,6 +121,9 @@ struct HtmlConfig {
std::string background_image_format{"png"};
double background_image_dpi{144.0};
+ // PDF text mode
+ PdfTextMode pdf_text_mode{PdfTextMode::dual_layer};
+
// drm options
bool no_drm{false};
diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp
index 07a250d1..372ae3b6 100644
--- a/src/odr/internal/html/pdf_file.cpp
+++ b/src/odr/internal/html/pdf_file.cpp
@@ -472,13 +472,12 @@ class PatternRegistry : public DefsRegistry {
};
/// Deduplicates CSS declarations into atomic, single-property classes. PDF text
-/// emits one absolutely-positioned span per glyph run, and the same font sizes,
-/// offsets and spacings recur across the (potentially millions of) spans.
-/// Writing each declaration inline bloats the document — the Bluetooth Core
-/// spec reference output crossed GitHub's 100 MB file limit. Instead, every
-/// distinct declaration is registered once here, named `<prefix><n>` in
+/// emits one absolutely-positioned line block per detected line, and the same
+/// font sizes, offsets and spacings recur across the (potentially millions of)
+/// elements. Writing each declaration inline bloats the document. Instead,
+/// every distinct declaration is registered once here, named `<prefix><n>` in
/// first-seen order (e.g. `f1`, `f2` for font sizes, `t1` for a top offset),
-/// emitted once in <style>, and referenced by class on each span. This is
+/// emitted once in <style>, and referenced by class on each element. This is
/// representation-only: the computed style of every element is unchanged.
class AtomicStyles {
public:
@@ -530,7 +529,6 @@ class HtmlServiceImpl final : public HtmlService {
if (path == "document.html") {
return true;
}
-
return false;
}
@@ -538,7 +536,6 @@ class HtmlServiceImpl final : public HtmlService {
if (path == "document.html") {
return "text/html";
}
-
throw FileNotFound("Unknown path: " + path);
}
@@ -548,7 +545,6 @@ class HtmlServiceImpl final : public HtmlService {
write_document(writer);
return;
}
-
throw FileNotFound("Unknown path: " + path);
}
@@ -557,104 +553,113 @@ class HtmlServiceImpl final : public HtmlService {
if (path == "document.html") {
return write_document(out);
}
-
throw FileNotFound("Unknown path: " + path);
}
- // One emitted span. The styling is fully resolved into class tokens during
- // the first pass; only the (already escaped) text and class list survive to
- // the writing pass. A text run with an embedded font emits the dual layer as
- // a transparent selectable span carrying the real Unicode with the visible
- // glyph layer (PUA code points in the `@font-face` font) nested inside it:
- // the child is absolutely positioned at the run origin and inherits the
- // font size, spacing, and transform from the parent, so the placement
- // classes live only on the parent. `glyph_classes` is empty when there is no
- // nested layer (the legacy fallback path and display-only runs).
- struct SpanOut {
+ HtmlResources write_document(HtmlWriter &out) const {
+ // Only an explicit single_layer request leaves the dual-layer path.
+ if (config().pdf_text_mode != PdfTextMode::single_layer) {
+ return write_document_dual_layer(out);
+ }
+ return write_document_single_layer(out); }
+
+ // =========================================================================
+ // DUAL-LAYER MODE
+ // =========================================================================
+ //
+ // Two separate layers per page:
+ //
+ // Visual layer: paint-order glyph
+ // rendering. Text runs are grouped into line blocks
+ // by baseline; runs within a block flow inline, each nudged by a
+ // `margin-left`. A path or image in paint order closes the open block and
+ // is emitted into an SVG; the next text opens a fresh block. Fonts are
+ // re-encoded to the PUA (no real-Unicode cmap entries needed — the visual
+ // layer is `user-select:none`). Invisible text (Tr 3/7) is omitted here.
+ //
+ // Selection layer: transparent, selectable real
+ // Unicode. Text runs are grouped into per-line divs in content-stream
+ // order; space detection inserts separator spans
+ // on line/column breaks or wide gaps. Each run span is
+ // `display:inline-block; width:Xpx` with CSS `text-align:justify;
+ // text-align-last:justify; text-justify:inter-character` so the browser
+ // spreads the characters to fill the PDF advance without JavaScript.
+ // For gap spans between runs a zero-content `display:inline-block;
+ // width:Ypx` span is emitted.
+
+ // One run inside a visual line block. `classes` carries margin-left, font
+ // size, font-family+colour — the line block holds placement only.
+ struct VisRunOut {
std::string classes;
- std::string text;
- std::string glyph_classes;
- std::string glyph_text;
+ std::string text; // PUA glyph string (or real Unicode on the fallback path)
+ };
+ // One line block in the visual layer: absolutely positioned at the first
+ // run's origin. Runs flow inline, each nudged by margin-left.
+ struct VisLineOut {
+ std::string classes; // "t lN tN [mN]" (or matrix transform)
+ std::vector<VisRunOut> runs; // the runs that flow inline within this block
};
- // One vector item, already serialized to an SVG fragment in the page's
- // viewBox (PDF points, y-down): a painted `` or an ``.
- // Contiguous vector items share one `