diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 17d590ca..3868b7f6 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -205,6 +205,7 @@ jobs: with: token: ${{ secrets.PAT_ANDIWAND }} submodules: true + lfs: true - name: ubuntu install tidy if: runner.os == 'Linux' diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 07a250d1..1056fb26 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -20,8 +20,6 @@ #include #include -#include - #include #include #include @@ -41,6 +39,20 @@ namespace { /// the extra digits add up across a page full of path data. double round2(const double v) { return std::round(v * 100.0) / 100.0; } +/// Escape only HTML markup (`&`, `<`, `>`) for the selection layer. Unlike +/// `html::escape_text`, spaces are left as ordinary U+0020 rather than +/// rewritten to `&nbsp;`: the selection spans carry `white-space:pre`, so every +/// space already renders, and a non-breaking space would defeat the layer's +/// purpose — it doesn't match a normal space in find-in-page and it glues +/// adjacent words together under double-click. Tabs aren't expected in +/// extracted PDF text. +std::string escape_selection_text(std::string text) { + util::string::replace_all(text, "&", "&amp;"); + util::string::replace_all(text, "<", "&lt;"); + util::string::replace_all(text, ">", "&gt;"); + return text; +} + /// Serialize a transform as an SVG `matrix(...)`. Only the translation (e, f) /// is rounded — it lives in page-box units where 1/100 px is plenty; the linear /// part (a..d) keeps full precision so small scale/skew factors aren't @@ -561,20 +573,34 @@ class HtmlServiceImpl final : public HtmlService { throw FileNotFound("Unknown path: " + path); } - // One emitted span. The styling is fully resolved into class tokens during - // the first pass; only the (already escaped) text and class list survive to - // the writing pass. 
A text run with an embedded font emits the dual layer as - // a transparent selectable span carrying the real Unicode with the visible - // glyph layer (PUA code points in the `@font-face` font) nested inside it: - // the child is absolutely positioned at the run origin and inherits the - // font size, spacing, and transform from the parent, so the placement - // classes live only on the parent. `glyph_classes` is empty when there is no - // nested layer (the legacy fallback path and display-only runs). + // One emitted span: the resolved class tokens plus the already-escaped text. + // The renderer paints text in two independent layers (see `write_document`): + // the **visual** layer (`PageOut::items`, in paint order) carries the + // unselectable glyphs as a flat list of positioned spans; the **selection** + // layer (`PageOut::sel_lines`, in content/reading order) carries the + // transparent, selectable real Unicode grouped into per-line flow blocks + // (`LineOut`), each holding inline run spans. `SpanOut` is the shared leaf of + // both — a visual glyph span, or one flowed selection run. struct SpanOut { std::string classes; std::string text; - std::string glyph_classes; - std::string glyph_text; + // Selection layer only: the run's true advance in CSS px, emitted as + // `data-w` so the on-load fit script can stretch/squeeze the transparent + // span to the real glyph width with `letter-spacing` (the system fallback + // font it renders in has its own, different advances). For an inter-run + // separator it is the gap width, so the flowed runs land at their true + // x-offsets. 0 on visual spans and on unfitted runs — no attribute written. + double width{0}; + }; + // The selection layer is grouped into per-line flow blocks: one + // absolutely-positioned container per PDF line (its `classes` carry the line + // origin placement plus `.i`), holding inline run ``s that *flow*. 
This + // is what makes native within-line selection, double-click and find-in-page + // work and keeps the run boxes real — see `write_document` and the on-load + // fit script. + struct LineOut { + std::string classes; + std::vector<SpanOut> runs; + }; // One vector item, already serialized to an SVG fragment in the page's // viewBox (PDF points, y-down): a painted `<path>` or an `<image>`. @@ -582,14 +608,19 @@ class HtmlServiceImpl final : public HtmlService { struct PathOut { std::string svg; }; - // Page content in paint (z) order: text spans and paths interleave, so a - // later fill occludes earlier text and vice versa. + // Visual page content in paint (z) order: glyph spans and paths interleave, + // so a later fill occludes earlier text and vice versa. using PageItem = std::variant<SpanOut, PathOut>; struct PageOut { std::string classes; double width{0}; // page box width, PDF points (for the SVG viewBox) double height{0}; // page box height, PDF points std::vector<PageItem> items; + // The selection layer: transparent, selectable Unicode grouped into + // per-line flow blocks in content-stream (reading) order, emitted after the + // visual content so they form one contiguous, cleanly selectable run in the + // DOM. + std::vector<LineOut> sel_lines; // `<clipPath>` defs for this page's clipped paths, emitted once in a hidden // `<svg>`; the path fragments reference them by id. Empty when no path on // the page is clipped. @@ -621,22 +652,14 @@ class HtmlServiceImpl final : public HtmlService { // whose embedded font is absent, not an SFNT, or not re-encodable keeps // index 0 and renders through the fallback path, exactly as before. // - // The `@font-face` rules are *not* built here: a font also gets - // real-Unicode `cmap` entries for the scalars its 1:1 runs use (so those - // runs can collapse to a single span), and that used-scalar set is only - // complete after the first pass. 
`font_family` therefore just validates and - // indexes the font; `accepted_fonts` / `used_unicode` (indexed by `index - - // 1`) carry it to the post-pass that re-encodes with the extra entries and - // emits `font_faces`. + // The `@font-face` rules are *not* built here: the font subset isn't needed + // until the post-pass, which re-encodes each accepted font to the PUA and + // emits `font_faces`. `font_family` therefore just validates and indexes + // the font; `accepted_fonts` (indexed by `index - 1`) carries it forward. std::uint32_t family_count = 0; std::string font_faces; - std::string glyph_styles; // combined per-font `.fvN`/`.fnN`/`.gvN`/`.giN` + std::string glyph_styles; // per-font visible-glyph class `.fvN` std::vector accepted_fonts; - std::vector> used_unicode; - // Which combined per-font classes occur, so only those are emitted in - // . Slots: [0]=`.fvN` (plain visible), [1]=`.fnN` (plain invisible), - // [2]=`.gvN` (nested-glyph visible), [3]=`.giN` (nested-glyph invisible). - std::vector> font_class_used; std::unordered_map family_index; const auto font_family = [&](pdf::Font *font) -> std::uint32_t { const auto [it, inserted] = family_index.try_emplace(font, 0); @@ -682,31 +705,14 @@ class HtmlServiceImpl final : public HtmlService { const std::uint32_t index = ++family_count; it->second = index; accepted_fonts.push_back(font); - used_unicode.emplace_back(); - font_class_used.push_back({false, false, false, false}); return index; }; - // The combined per-font class carrying `font-family:'odr-fN'` and the paint - // colour, so a font-bearing span names one class instead of restating the - // font family (interned) plus `.gv`/`.i` on every one of the millions of - // spans. A `nested` glyph layer additionally folds in the `.t .g` placement - // (absolute at the run origin, unselectable). Records the combo as used so - // only the rules that occur are emitted in . 
- const auto font_class = [&](const std::uint32_t font, const bool inv, - const bool nested) { - const int slot = nested ? (inv ? 3 : 2) : (inv ? 1 : 0); - font_class_used[font - 1][slot] = true; - const char *const prefix = - nested ? (inv ? "gi" : "gv") : (inv ? "fn" : "fv"); - return prefix + std::to_string(font); - }; - - // A real-Unicode scalar may carry a `cmap` entry (letting its run collapse) - // only inside the BMP and outside the PUA (`U+E000..U+F8FF`), so a glyph's - // own deterministic PUA code point (`pua_code_point`) is never shadowed. - const auto collapsible_unicode = [](const char32_t c) { - return c <= 0xFFFF && !(c >= 0xE000 && c <= 0xF8FF); + // The per-font visible-glyph class `.fvN`, carrying `font-family:'odr-fN'` + // and the black paint, so a glyph span names one class instead of restating + // the font family on every one of the (potentially millions of) spans. + const auto font_class = [](const std::uint32_t font) { + return "fv" + std::to_string(font); }; // The PUA glyph string for a run: each character code -> glyph id -> @@ -796,6 +802,20 @@ class HtmlServiceImpl final : public HtmlService { GradientRegistry gradients(static_cast(pages_out.size())); PatternRegistry patterns(static_cast(pages_out.size())); + // Selection-layer grouping sweep state, in content-stream (reading) + // order. Tracks the previous text run's baseline and right edge (page-box + // points, y down) so the next run can be prefixed with a separator space + // on a line/column break or a wide intra-line gap. + bool have_prev_run = false; + double prev_baseline = 0; + double prev_end = 0; + bool prev_ends_space = false; + bool prev_was_matrix = false; + // Index of the line block currently being filled in `sel_lines`, or -1 + // before the first line opens. Runs append to its `runs`; a line/column + // break (or a matrix run) opens the next. 
+ int cur_line = -1; + for (const pdf::PageElement &element : pdf::extract_page(stream, *page->resources, *m_logger)) { // A painted path: serialize its subpaths to an SVG `` fragment in @@ -870,20 +890,21 @@ class HtmlServiceImpl final : public HtmlService { const util::math::Transform2D m = flip_glyph * text.transform * to_box; - // Tr 3 (invisible) and Tr 7 (clip-only) paint nothing; keep them - // selectable via the transparent `.i` class. + // Tr 3 (invisible) and Tr 7 (clip-only) paint nothing; they emit no + // visual span at all and survive only in the selection layer (so + // OCR-over-scan text stays searchable/selectable). const bool invisible = text.rendering_mode == pdf::TextRenderingMode::invisible || text.rendering_mode == pdf::TextRenderingMode::clip; - // The run's visible paint colour, folded onto the visible span as an - // interned colour class — but only when it is not the default black, so - // the overwhelmingly common black run adds nothing. The per-font - // `.fvN`/`.gvN` classes declare `color:#000`; this class is emitted - // after them in (equal specificity), so it overrides. Invisible - // runs (Tr 3/7) stay transparent via `.i`, so they take no colour - // class. The fill modes paint with the non-stroking colour, the - // stroke-only modes (Tr 1/5) with the stroking colour. + // The run's visible paint colour, folded onto the visual glyph span as + // an interned colour class — but only when it is not the default black, + // so the overwhelmingly common black run adds nothing. The per-font + // `.fvN` class declares `color:#000`; this class is emitted after it in + // (equal specificity), so it overrides. The selection layer is + // transparent and takes no colour. The fill modes paint with the + // non-stroking colour, the stroke-only modes (Tr 1/5) with the stroking + // colour. 
std::string color_suffix; if (!invisible) { const pdf::GraphicsState::Color &paint = @@ -913,7 +934,12 @@ class HtmlServiceImpl final : public HtmlService { // the uniform branch, carried by the CSS matrix in the general branch // (so spacing there is expressed pre-transform, scale == 1). double scale; - if (m.b == 0 && m.c == 0 && m.a == m.d) { + // A rotated/skewed run takes the general (matrix) branch; an upright + // uniform run the left/top/font-size branch. Only the latter can flow + // inside a line block and be fit by `letter-spacing` (the fit measures + // on-screen width, which for a matrix box is a rotated bbox). + const bool is_matrix = !(m.b == 0 && m.c == 0 && m.a == m.d); + if (!is_matrix) { // Upright uniform scale: fold the scale into the font size and place // the origin with left/top, so the (otherwise near-universal) matrix // is dropped. The ascent shift is purely vertical here (local y maps @@ -940,6 +966,13 @@ class HtmlServiceImpl final : public HtmlService { scale = 1; } + // Placement-only class set (origin + font size), snapshot before the + // Tc/Tw spacing classes below. The selection line container uses this: + // the spacing is folded into each run's `data-w` advance and applied by + // the `letter-spacing` fit, so carrying a separate Tc `letter-spacing` + // would collide with it. The visual glyph layer keeps the full `base`. + const std::string place = base; + // PDF char/word spacing (Tc/Tw) translate directly to CSS. 
TJ kerning // needs no expression here: `extract_text` emits a separate segment per // TJ string and folds the adjustment into the following segment's @@ -974,94 +1007,141 @@ class HtmlServiceImpl final : public HtmlService { round2(text.word_spacing * scale * pt_to_px))); } - // A run collapses to a single span — selectable *and* visible, the real - // Unicode rendered directly in the embedded font — when it has an - // embedded font, carries text, is 1:1 with its codes (no /ToUnicode - // expansion, /ActualText, or inferred space), and every glyph wins a - // real-Unicode `cmap` entry. The winner of a scalar is the first - // collapse-candidate run (in document order) to use it; processing - // order *is* document order, so an earlier run's claim is already - // visible and no later run can unseat it — the decision is final here. - const bool collapse_candidate = - font != 0 && !text.text.empty() && text.font != nullptr && - util::string::utf8_length(text.text) == text.advances.size(); - - if (collapse_candidate) { - // Stake first-wins real-Unicode -> glyph claims and decide collapse - // in one walk: the run collapses iff each code's glyph wins (or - // matches) its scalar. Claims are staked for every collapsible scalar - // even when the run ends up dual, so later runs see them. The - // post-pass only bakes the won scalars into the shared font's `cmap`. - std::map &won = used_unicode[font - 1]; - bool collapse = true; - auto cp = text.text.begin(); - for (const std::uint32_t code : text.font->codes(text.codes)) { - const char32_t uchar = utf8::unchecked::next(cp); - const std::uint16_t glyph = text.font->glyph_for_code(code); - if (!collapsible_unicode(uchar)) { - collapse = false; - continue; + // --- Selection layer ------------------------------------------------- + // Every run with extractable text feeds the transparent, selectable + // layer (`.i`) with its real Unicode. 
Runs are grouped into per-line + // flow blocks: one absolutely-positioned container per PDF line (placed + // at the line's first run origin via `place`), whose inline run spans + // *flow*, so a native drag, double-click or find-in-page works within + // the line and the run boxes are real. A content-order sweep decides, + // per run, whether it opens a new line, extends the line with a fresh + // run, or merges into the previous run: + // + // * A line/column break (baseline jump, or x regressing left of the + // previous run's end) opens a new line block. The previous line is + // closed with a trailing space (deduped) so a phrase split across + // the break is still found/copied as "word1 word2"; that space + // carries no `data-w`, so the fit skips it and it renders past the + // last glyph (transparent, harmless). The inferred leading space a + // run often carries is dropped at a line start — it belongs to the + // break, already covered by the trailing space. + // * A wide intra-line gap, or a whitespace boundary, starts a fresh + // run within the same line. The inter-run gap rides on a separator + // span whose `data-w` *is* the gap width, so the flowed runs land at + // their true x-offsets (telescoping: a separator gap plus a run + // advance equals the next run's offset), and wide gaps — table + // columns on one baseline — are reproduced, not collapsed to a + // single space. The separator holds one U+0020, deduped against a + // space already ending the previous run (a doubled space breaks + // literal find-in-page); the inferred leading space is peeled onto + // it so each word run starts at its first glyph (a double-click, + // which excludes surrounding whitespace, then highlights the word + // without a space-width offset). + // * A tight same-baseline continuation with no whitespace at the + // boundary merges into the previous run. 
PDF splits one word into + // several runs at every TJ kerning adjustment, and the browser finds + // word boundaries within a single text node only, so folding the + // continuation keeps the whole word selectable as a unit; its + // advance extends the same run's fit target. + // + // A rotated/skewed (matrix) run cannot flow or be `letter-spacing`-fit + // (its on-screen box is a rotated bbox), so it gets its own single-run + // line block positioned by its own matrix and left unfitted (no + // `data-w`) — reproducing the old per-run absolute placement, with no + // flow benefit but no regression either. + if (!text.text.empty()) { + // Run origin and horizontal extent in page-box points (y down). The + // advance (`text.width`) lives in the text matrix's space; its box + // extent scales by the text-matrix -> box x-axis length. The + // placement transform's x-axis (`m.a`, `m.b`) additionally folds in + // horizontal scaling (Tz), but `text.width` already advanced with Tz + // in `segment_advances`; divide it back out so Tz is applied once + // (and so `font_pt` tracks the Tz-free em). + const double ox = m.e; + const double baseline = m.f; + const double tz = text.horizontal_scaling / 100.0; + const double axis = tz != 0 ? std::hypot(m.a, m.b) / tz : 0; + const double extent = text.width * axis; + const double font_pt = text.size * axis; + const bool starts_space = text.text.front() == ' '; + const double width_px = extent * pt_to_px; + // The fit target: 0 for a matrix run (skipped by the fit), else the + // run's true advance. + const double fit_w = is_matrix ? 0.0 : width_px; + // Inter-run gap in box px (only meaningful within a line). + const double gap_px = std::max(0.0, ox - prev_end) * pt_to_px; + + // Open a new line block on the first run, a matrix run (or just after + // one), or a detected line/column break. 
+ bool new_line = !have_prev_run || is_matrix || prev_was_matrix; + bool gap = false; + if (have_prev_run && font_pt > 0 && !new_line) { + new_line = std::abs(baseline - prev_baseline) > 0.6 * font_pt || + ox < prev_end - 0.5 * font_pt; + gap = ox - prev_end > 0.25 * font_pt; + } + + std::string core = starts_space ? text.text.substr(1) : text.text; + + if (new_line) { + if (cur_line >= 0 && have_prev_run && !prev_ends_space) { + page_out.sel_lines[cur_line].runs.push_back(SpanOut{"", " ", 0}); + } + page_out.sel_lines.push_back(LineOut{place + " i", {}}); + cur_line = static_cast(page_out.sel_lines.size()) - 1; + if (!core.empty()) { + page_out.sel_lines[cur_line].runs.push_back( + SpanOut{"", escape_selection_text(std::move(core)), fit_w}); + } + } else if (gap || prev_ends_space || starts_space) { + // Fresh run within the line, gap carried by a separator span. + std::vector &runs = page_out.sel_lines[cur_line].runs; + if (!prev_ends_space && !runs.empty()) { + runs.push_back(SpanOut{"", " ", gap_px}); } - const auto [it, inserted] = won.emplace(uchar, glyph); - if (!inserted && it->second != glyph) { - collapse = false; + if (!core.empty()) { + runs.push_back( + SpanOut{"", escape_selection_text(std::move(core)), fit_w}); + } + } else { + // Tight, whitespace-free continuation: extend the previous run. + std::vector &runs = page_out.sel_lines[cur_line].runs; + if (!runs.empty()) { + runs.back().text += escape_selection_text(text.text); + runs.back().width += width_px; } } - if (collapse) { - // One span: the real Unicode rendered in the embedded font, named - // by the combined per-font class (black visible / transparent - // invisible), selectable either way. 
- std::string classes = std::move(base); - classes += ' '; - classes += font_class(font, invisible, /*nested=*/false); + + prev_baseline = baseline; + prev_end = ox + extent; + prev_ends_space = text.text.back() == ' '; + prev_was_matrix = is_matrix; + have_prev_run = true; + } + + // --- Visual layer ---------------------------------------------------- + // Unselectable glyphs in paint order. Invisible runs (Tr 3/7) paint + // nothing, so they emit no visual span — they live only in the + // selection layer above. + if (!invisible) { + if (font != 0) { + // PUA code points in the embedded font, carrying the placement + // (`base`), `.g` (unselectable) and the per-font paint+family + // class. + std::string classes = base + " g "; + classes += font_class(font); classes += color_suffix; page_out.items.push_back( - SpanOut{std::move(classes), escape_text(text.text), {}, {}}); + SpanOut{std::move(classes), + escape_text(glyph_run(*text.font, text.codes))}); } else { - // Dual layer (a glyph lost its scalar to an earlier one): a - // transparent selectable Unicode span with the PUA glyph layer - // nested inside, the latter folded into the combined `.gvN` / - // `.giN` class. The colour rides the visible (nested) layer. - page_out.items.push_back(SpanOut{ - base + " i", escape_text(text.text), - font_class(font, invisible, /*nested=*/true) + color_suffix, - escape_text(glyph_run(*text.font, text.codes))}); - } - } else if (font != 0) { - // The visible glyph layer: PUA code points in the embedded font, - // named by the combined per-font class (paint colour + font family). - std::string glyph_text = - escape_text(glyph_run(*text.font, text.codes)); - - if (!text.text.empty()) { - // Dual layer: a transparent selectable span carrying the real - // Unicode (for copy/search) with the glyph layer nested inside. - // The nested child overlays the run origin and inherits the - // placement via the combined `.gvN` / `.giN` class. 
- page_out.items.push_back(SpanOut{ - base + " i", escape_text(text.text), - font_class(font, invisible, /*nested=*/true) + color_suffix, - std::move(glyph_text)}); - } else { - // Display-only run: nothing is extractable (the `no_unicode` case), - // so the glyph layer stands alone and carries the placement itself - // (`base`), `.g` (unselectable) and the combined paint+font class. - std::string glyph_classes = base + " g "; - glyph_classes += font_class(font, invisible, /*nested=*/false); - glyph_classes += color_suffix; - page_out.items.push_back(SpanOut{ - std::move(glyph_classes), std::move(glyph_text), {}, {}}); - } - } else { - // Legacy single-layer path: no embedded font, render the Unicode in a - // fallback font. - std::string classes = base; - if (invisible) { - classes += " i"; + // No embedded font: render the Unicode in a fallback font, + // unselectable (the selection layer owns interaction). + std::string classes = base + " g"; + classes += color_suffix; + page_out.items.push_back( + SpanOut{std::move(classes), escape_text(text.text)}); } - classes += color_suffix; - page_out.items.push_back( - SpanOut{std::move(classes), escape_text(text.text), {}, {}}); } } @@ -1070,62 +1150,39 @@ class HtmlServiceImpl final : public HtmlService { page_out.clip_defs = clips.defs() + gradients.defs() + patterns.defs(); } - // Post-pass: every page has been scanned, so the per-font used-scalar sets - // are complete. - // - // Re-encode each accepted font with its real-Unicode entries baked into the - // `cmap` (the PUA range is kept as a fallback) and emit the `@font-face` - // rules in index order, so the output stays deterministic. + // Post-pass: re-encode each accepted font to the PUA and emit its + // `@font-face` rule plus the per-font visible-glyph class in index order, + // so the output stays deterministic. 
The visual glyph layer renders PUA + // code points only (selection rides the separate transparent layer), so no + // real-Unicode `cmap` entries are baked. for (std::uint32_t i = 0; i < family_count; ++i) { pdf::Font *font = accepted_fonts[i]; - const std::map &extra = used_unicode[i]; std::string reencoded; if (auto sfnt = std::dynamic_pointer_cast( font->embedded_font)) { - font::reencode_to_pua(*sfnt, extra); + font::reencode_to_pua(*sfnt); std::ostringstream sfnt_out; sfnt->write(sfnt_out); reencoded = std::move(sfnt_out).str(); } else if (auto cff = std::dynamic_pointer_cast( font->embedded_font)) { - reencoded = font::cff::wrap_to_otf(*cff, extra); + reencoded = font::cff::wrap_to_otf(*cff); } const std::string url = file_to_url(reencoded, "font/ttf"); - font_faces += "@font-face{font-family:'odr-f" + std::to_string(i + 1) + - "';src:url(" + url + ");}"; - - // The combined per-font classes for this font, only those used. `.fvN` / - // `.fnN` carry just the paint colour and font family (placement stays on - // the span's own classes); `.gvN` / `.giN` additionally fold in the - // nested glyph layer's `.t` placement and `.g` unselectability. 
const std::string n = std::to_string(i + 1); - const std::string family = "font-family:'odr-f" + n + "'"; - constexpr const char *placement = - "position:absolute;left:0;top:0;transform-origin:0 0;" - "white-space:pre;line-height:1;user-select:none;"; - const auto rule = [&](const char *cls, const char *head, - const char *color) { - glyph_styles += '.'; - glyph_styles += cls; - glyph_styles += n; - glyph_styles += '{'; - glyph_styles += head; - glyph_styles += color; - glyph_styles += family; - glyph_styles += '}'; - }; - if (font_class_used[i][0]) { - rule("fv", "", "color:#000;"); - } - if (font_class_used[i][1]) { - rule("fn", "", "color:transparent;"); - } - if (font_class_used[i][2]) { - rule("gv", placement, "color:#000;"); - } - if (font_class_used[i][3]) { - rule("gi", placement, "color:transparent;"); - } + font_faces += "@font-face{font-family:'odr-f"; + font_faces += n; + font_faces += "';src:url("; + font_faces += url; + font_faces += ");}"; + + // `.fvN` carries the font family and the black paint; placement (`.t`), + // unselectability (`.g`) and any non-black colour stay on the span. + glyph_styles += ".fv"; + glyph_styles += n; + glyph_styles += "{color:#000;font-family:'odr-f"; + glyph_styles += n; + glyph_styles += "'}"; } // Pass 2: write the document, now that the catalog is complete. @@ -1148,12 +1205,11 @@ class HtmlServiceImpl final : public HtmlService { out.out() << ".p{position:relative;margin:16px auto;background:#fff;" "box-shadow:0 1px 4px rgba(0,0,0,.5)}"; // `font-kerning:none` + `font-variant-ligatures:none` keep the browser from - // applying the embedded font's GPOS/GSUB tables. A collapsed run now emits - // real Unicode in that font, so without this a sequence like `fi`/`AV` - // could be re-shaped (ligature substitution, kerning) after this code - // already fixed the PDF glyph IDs and advances, shifting pixels and run - // widths for otherwise 1:1 text. The PUA glyph layer was immune; restore - // that here. 
+ // applying the embedded font's GPOS/GSUB tables: the PUA glyph layer + // carries exact PDF glyph IDs and advances, and ligature substitution / + // kerning would re-shape it, shifting pixels and run widths. Shared by both + // layers + // (`.t`). // `line-height:1` fixes the box top one em-ascent above the baseline so the // baseline shift applied to each run's `top`/matrix (see `ascent_em`) lands // the glyphs on the PDF text origin; the browser's default `normal` leading @@ -1161,12 +1217,11 @@ class HtmlServiceImpl final : public HtmlService { out.out() << ".t{position:absolute;left:0;top:0;transform-origin:0 0;" "white-space:pre;line-height:1;font-kerning:none;" "font-variant-ligatures:none}"; - // Invisible text render modes (Tr 3/7): kept in the DOM for selection and - // search (OCR-over-scan), but not painted. + // The selection layer: transparent (painted by the glyph layer underneath) + // but selectable and searchable, including OCR-over-scan invisible text. out.out() << ".i{color:transparent}"; - // The display-only glyph layer (`no_unicode` runs) is not selectable, so - // the PUA code points stay off the clipboard; `.g` pairs with a combined - // `.fvN`/`.fnN` paint+font class on those spans. + // The visual glyph layer is not selectable — selection rides the `.i` + // layer, so the (often PUA) visible code points stay off the clipboard. out.out() << ".g{user-select:none}"; // Vector graphics: one or more `` overlays per page, each filling the // page box (viewBox in PDF points). `overflow:hidden` clips each overlay to @@ -1183,29 +1238,37 @@ class HtmlServiceImpl final : public HtmlService { "overflow:hidden;pointer-events:none}"; // Embedded fonts, re-encoded to the PUA and served inline. out.out() << font_faces; - // Combined per-font classes (`.fvN`/`.fnN` paint+font, `.gvN`/`.giN` also - // placement), so a font-bearing span names one class for its font. 
+ // Per-font visible-glyph classes (`.fvN` paint+font family), so a glyph + // span names one class for its font. out.out() << glyph_styles; - // Per-value atomic classes (font sizes, offsets, transforms, ...). + // Per-value atomic classes (font sizes, offsets, transforms, ...). Shared + // by the visual glyph layer and the selection layer (both anchor at the run + // origin via these placement classes). styles.write_rules(out.out()); out.write_header_style_end(); out.write_header_end(); const auto write_span = [&out](const SpanOut &span) { - // Inline so the whole run (and its nested glyph layer) stays on one line: - // smaller output and a more legible diff than the open/text/close split, - // while each run still gets its own line under the page div. - out.write_element_begin( - "span", - HtmlElementOptions().set_inline(true).set_class(span.classes)); - out.write_raw(span.text); - if (!span.glyph_classes.empty()) { - out.write_element_begin("span", - HtmlElementOptions().set_inline(true).set_class( - span.glyph_classes)); - out.write_raw(span.glyph_text); - out.write_element_end("span"); + // Inline so the run stays on one line: smaller output and a more legible + // diff than the open/text/close split, while each run still gets its own + // line under the page div. + HtmlElementOptions options; + options.set_inline(true); + // Inline selection run spans carry no class (placement and transparency + // are inherited from the line container); everything else names classes. + if (!span.classes.empty()) { + options.set_class(span.classes); + } + // Selection spans carry their true advance (px) for the fit script. 
+ std::string data_w; + if (span.width > 0) { + std::ostringstream w; + w << "data-w=\"" << round2(span.width) << "\""; + data_w = std::move(w).str(); + options.set_extra(data_w); } + out.write_element_begin("span", options); + out.write_raw(span.text); out.write_element_end("span"); }; @@ -1213,8 +1276,19 @@ class HtmlServiceImpl final : public HtmlService { for (const PageOut &page : pages_out) { out.write_element_begin("div", HtmlElementOptions().set_class(page.classes)); - // Clip-path and gradient defs for this page, in a hidden zero-size - // ``. They are referenced by id from the page's fragments; + + // Visual layer: the page's graphics and unselectable glyphs, grouped in + // one parent and hidden from the accessibility tree (`aria-hidden`) — the + // glyphs are often PUA code points a screen reader would read as + // gibberish, and the real text is carried by the selection layer below. + // The wrapper is unpositioned and contributes no height (its children are + // `position:absolute`), so it stays layout-neutral and the spans still + // anchor to the `.p` page box. + out.write_element_begin("div", + HtmlElementOptions().set_class("vis").set_extra( + R"(aria-hidden="true")")); + // Clip-path, gradient and pattern defs for this page, in a hidden + // zero-size ``. They are referenced by id from the page's fragments; // `clipPathUnits`/`gradientUnits` are `userSpaceOnUse`, so the geometry // is read in the user space of the referencing element (the page // viewBox), not this ``. @@ -1252,7 +1326,69 @@ class HtmlServiceImpl final : public HtmlService { } close_svg(); out.write_element_end("div"); + + // Selection layer: transparent, selectable Unicode in reading order, + // grouped in its own parent and emitted after the visual layer so the + // spans are contiguous in the DOM and a drag- or find-selection flows + // cleanly across runs and lines without the visual glyphs (which are + // `user-select:none`) interrupting it. 
+ out.write_element_begin("div", HtmlElementOptions().set_class("sel")); + for (const LineOut &line : page.sel_lines) { + // One absolutely-positioned container per PDF line; its run spans flow + // inline, so selection/find/double-click work natively across them. + // `set_inline` stops the writer from emitting newlines + indent + // *between* the run spans: the container carries `white-space:pre` + // (from + // `.t`), so that formatting whitespace would otherwise render as real + // text and shove the runs onto a new line / indent them. + out.write_element_begin( + "div", + HtmlElementOptions().set_inline(true).set_class(line.classes)); + for (const SpanOut &run : line.runs) { + write_span(run); + } + out.write_element_end("div"); + } + out.write_element_end("div"); + + out.write_element_end("div"); } + + // Selection-fit script. The transparent run spans render real Unicode in + // the browser's system font, whose advances differ from the embedded + // glyphs, so an active highlight is wider or narrower than the visible run. + // Each fitted span carries its true advance in `data-w` (CSS px); correct + // its box with `letter-spacing = (target - measured) / glyph_count`. Unlike + // the old `scaleX`, `letter-spacing` is consumed *during* layout, so the + // box actually grows/shrinks and the following run flows from the corrected + // edge — what makes the per-line flow blocks land. Negative values squeeze + // a too-wide run. The page is fully usable without this — it only tightens + // the highlight rectangle and the within-line x-offsets, so it degrades + // gracefully where scripts are blocked. + // + // Run per page, lazily, via `IntersectionObserver`: a large document only + // pays for the pages actually scrolled into view, never a single + // whole-document pass on load. 
Within a page, read every width first and + // write every `letter-spacing` second so the measurement loop isn't + // interleaved with style writes (which would force a reflow per span); a + // run's own width is independent of its siblings' spacing, so one + // measure-then-write pass fits them all and the cumulative offsets resolve. + // Matrix (rotated/skewed) runs carry no `data-w` and are skipped. + out.write_script_begin(); + out.write_raw( + R"JS((function(){if(!window.IntersectionObserver)return;)JS" + R"JS(var io=new IntersectionObserver(function(es){es.forEach(function(e){)JS" + R"JS(if(!e.isIntersecting)return;io.unobserve(e.target);)JS" + R"JS(var s=e.target.querySelectorAll('.sel span[data-w]'),n=s.length,)JS" + R"JS(w=new Array(n),i,c,d;)JS" + R"JS(for(i=0;i0&&w[i]>0){)JS" + R"JS(d=parseFloat(s[i].getAttribute('data-w'));)JS" + R"JS(s[i].style.letterSpacing=((d-w[i])/c)+'px';}}})},{rootMargin:'200px'});)JS" + R"JS(document.querySelectorAll('.p').forEach(function(p){io.observe(p);});})();)JS", + false); + out.write_script_end(); + out.write_body_end(); out.write_end(); diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index 1a042398..0267de6a 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -385,6 +385,66 @@ recoverable Unicode are additionally marked non-extractable (`user-select: none` a rendering risk — such PDFs look right, their text just isn't selectable until the tables land. +**Selection layer separate from the glyph layer (decision 2026-06).** Selection +and find-in-page used to be poor because every show-text segment became one +absolutely-positioned transparent span at its run origin: runs sharing a line +were independent boxes with no whitespace or reading order between them, so a +phrase — or even one word split across `TJ` kerning runs — crossing a run +boundary was unfindable and a drag jumped between boxes. 
Fix: keep the **visual +glyph layer exactly as is** (absolutely-positioned PUA spans — what makes +rendering pixel-perfect) and restructure **only** the transparent Unicode into a +separate **selection layer** (`PageOut::sel_lines`, transparent via `.i`), +emitted contiguously *after* the visual content in content-stream order so a +native drag or Ctrl+F flows through it without an unselectable glyph span +(`.g`, `user-select:none`) interrupting. PDF.js-style layering, done statically +at generation time. Key points: +- **Content-stream-order sweep, never a global re-sort.** Content order is + almost always reading order (a producer paints a column top-to-bottom, then the + next); a global (baseline, x) sort would interleave columns sharing a y-band + and scramble multi-column text and tables. An O(n) sweep tracks the previous + run's baseline and right edge and decides each run's boundary. +- **Eager to split, conservative to merge.** A new *line block* opens when the + baseline jumps (>0.6·font-size) or the run starts left of the previous run's + end; within a line, a gap exceeding 0.25·font-size (or a whitespace boundary) + starts a fresh run, otherwise a tight, whitespace-free same-baseline + continuation **merges** into the previous run — PDF splits one word at every + `TJ` kern and the browser finds word boundaries only within a single text + node, so folding the continuation keeps the whole word selectable. A single + space is inserted at every break (so `"the quick"` matches across it): a + separator span within the line, or the previous line's trailing space across a + line break, deduped against whitespace the run already carries (a doubled + space breaks literal find-in-page). Cells never merge across columns, so + tables fall out as separate runs (correct selection) with **no table + detection**. 
+- **Per-line flow blocks + an on-load `letter-spacing` fit (the one non-JS-free + bit).** Each PDF line is one absolutely-positioned container (placed at its + first run's origin, reusing the glyph-layer placement *minus* the Tc/Tw + spacing classes); its run `<span>`s **flow inline** rather than being + individually positioned, so a native drag, double-click and find-in-page work + within the line and the run boxes are *real*. Horizontal placement within the + line is purely cumulative — each separator span's `data-w` is the inter-run + gap, so word advances and gaps telescope to each run's true x-offset (wide + table-column gaps reproduced, not collapsed). The transparent text renders in + a system fallback font with its own advances, so a tiny on-load JS script fits + each run (carrying its true advance as `data-w`) with + `letter-spacing = (target − measured) / glyph_count` — negative to squeeze a + too-wide run. Unlike the old per-run `scaleX`, `letter-spacing` is consumed + *during* layout, so the box grows/shrinks and the following run flows from the + corrected edge (that is why the flow blocks work). A rotated/skewed (matrix) + run cannot flow or be fit (its on-screen box is a rotated bbox), so it keeps + its own single-run line block positioned by its matrix and carries no `data-w` + (the fit skips it) — reproducing the old per-run absolute placement. Output is + **no longer fully JS-free**; visual rendering stays byte-for-byte unchanged, + only the selection layer changed. + - **Known follow-ups.** Vertical placement within a line still rides each + run's shared baseline, but a line block assumes ~uniform leading; mixed font + sizes in one line (sub/superscripts) align by their own baseline but the + container's box height tracks the first run's size (cosmetic highlight drift + only). Cross-line find-in-page depends on the browser treating the trailing + space + block boundary as a single space.
The next rung — grouping lines + into paragraph blocks for native cross-line selection — is deferred (needs + layout analysis with a confidence fallback; see the chat that scoped this). + --- ## Tests @@ -657,6 +717,15 @@ tree, little else. CID → Unicode tables (large external data; the generator scaffolding in `tools/pdf/generate_cid_data.py` is landed, the storage decision and lookup remain). +- **Selection-layer refinements** (deferred from the selection-layer work): no + **de-hyphenation** — a line-final hyphen (`"infor-\nmation"`) stays unfindable + as `"information"`, since auto-joining is genuinely ambiguous (soft break + hyphen vs. a real `well-known`; PDF almost never marks the difference, only the + rare `U+00AD` is unambiguous) and lossy enough to hurt copy fidelity — revisit + as an opt-in / `U+00AD`-only heuristic. Also: gap-based word separators within + a line beyond the producer's inferred spaces (only if word-merging shows up in + practice), and richer static structure recovery (semantic `<table>` / + multi-column markup) — a separate, larger layout-analysis effort. - **Bidi & vertical writing** (deferred): RTL run reordering for the layout/selection order, and vertical writing mode (`Identity-V`/CJK — the `/W2`/`/DW2` vertical metrics and a perpendicular pen advance, which the diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private index 85a14d01..6c5c7607 160000 --- a/test/data/reference-output/odr-private +++ b/test/data/reference-output/odr-private @@ -1 +1 @@ -Subproject commit 85a14d010ffb87dddeb67cdc1aa18bd54d502c47 +Subproject commit 6c5c760724c092824107e94d736427e5d418c8dc diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public index 45b29f5b..3e1a28cb 160000 --- a/test/data/reference-output/odr-public +++ b/test/data/reference-output/odr-public @@ -1 +1 @@ -Subproject commit 45b29f5b796bda9ad0c14661179e50f91f47aecc +Subproject commit 3e1a28cbc2f95c908c0a9b3966d4420c9995d393