diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..2002804d --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +# Saved Google result pages are fixtures, preserve their +# exact bytes/EOLs so the byte-for-byte expected-array match +files/*.html -text +spec/fixtures/*.html -text diff --git a/.rspec b/.rspec new file mode 100644 index 00000000..5be63fcb --- /dev/null +++ b/.rspec @@ -0,0 +1,2 @@ +--require spec_helper +--format documentation diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 00000000..4ac74de9 --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,16 @@ +AllCops: + TargetRubyVersion: 3.2 + NewCops: enable + SuggestExtensions: false + Exclude: + - "files/**/*" + - "spec/fixtures/**/*" + +# Match the existing code (double quotes throughout). +Style/StringLiterals: + EnforcedStyle: double_quotes + +# Long describe/context blocks are idiomatic in RSpec. +Metrics/BlockLength: + Exclude: + - "spec/**/*" diff --git a/Gemfile b/Gemfile new file mode 100644 index 00000000..f239c276 --- /dev/null +++ b/Gemfile @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +gem "nokogiri", "~> 1.19" + +group :test do + gem "rspec", "~> 3.13" +end + +group :development, :test do + gem "rubocop", "~> 1.88" +end diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 00000000..fc6affb4 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,84 @@ +GEM + remote: https://rubygems.org/ + specs: + ast (2.4.3) + diff-lcs (1.6.2) + json (2.19.9) + language_server-protocol (3.17.0.5) + lint_roller (1.1.0) + nokogiri (1.19.4-arm64-darwin) + racc (~> 1.4) + parallel (1.28.0) + parser (3.3.11.1) + ast (~> 2.4.1) + racc + prism (1.9.0) + racc (1.8.1) + rainbow (3.1.1) + regexp_parser (2.12.0) + rspec (3.13.2) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.6) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.5) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 
3.13.0) + rspec-mocks (3.13.8) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-support (3.13.7) + rubocop (1.88.0) + json (~> 2.3) + language_server-protocol (~> 3.17.0.2) + lint_roller (~> 1.1.0) + parallel (>= 1.10) + parser (>= 3.3.0.2) + rainbow (>= 2.2.2, < 4.0) + regexp_parser (>= 2.9.3, < 3.0) + rubocop-ast (>= 1.49.0, < 2.0) + ruby-progressbar (~> 1.7) + unicode-display_width (>= 2.4.0, < 4.0) + rubocop-ast (1.49.1) + parser (>= 3.3.7.2) + prism (~> 1.7) + ruby-progressbar (1.13.0) + unicode-display_width (3.2.0) + unicode-emoji (~> 4.1) + unicode-emoji (4.2.0) + +PLATFORMS + arm64-darwin-25 + +DEPENDENCIES + nokogiri (~> 1.19) + rspec (~> 3.13) + rubocop (~> 1.88) + +CHECKSUMS + ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383 + diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962 + json (2.19.9) sha256=9b9025b7cdddafa38d316eca0b2358488e42d417045c1b90d216a9fefe46b79a + language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc + lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87 + nokogiri (1.19.4-arm64-darwin) sha256=a46db9853286e6597b36ebc6953817d15acf3a299583eb3f89fdc6f91dd63527 + parallel (1.28.0) sha256=33e6de1484baf2524792d178b0913fc8eb94c628d6cfe45599ad4458c638c970 + parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54 + prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85 + racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f + rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a + regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb + rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587 + rspec-core (3.13.6) 
sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d + rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836 + rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47 + rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c + rubocop (1.88.0) sha256=e420ddf1662d0ef34bc8a2910ac4b396a7ddda0b51a708264405241734b08e0b + rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035 + ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33 + unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42 + unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f + +BUNDLED WITH + 4.0.10 diff --git a/README.md b/README.md index 4d5a093f..9df25dab 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,128 @@ Parse directly the HTML result page ([html file]) in this repository. No extra H [html file]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/van-gogh-paintings.html [expected array]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/expected-array.json -Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed). +Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed). Test against 2 other similar result pages to make sure it works against different layouts. (Pages that contain the same kind of carrousel. Don't necessarily have to be paintings.) The suggested time for this challenge is 4 hours. But, you can take your time and work more on it if you want. + +--- + +## Solution + +Parses the Knowledge Graph carousel out of a saved Google results page into an +array of `{ name, extensions, link, image }` objects. 
+ +The Van Gogh paintings case is the required deliverable and is covered. The same code handles other entity types (albums, buildings, cast) because only the *locator* changes per type. + +### Running it + +```sh +bundle install +bundle exec rspec # specs +bundle exec rubocop # lint +``` + +Run it against any saved results page (prints JSON to stdout): + +```sh +ruby -Ilib -rcarousel_extractor -rjson \ + -e 'puts JSON.pretty_generate(CarouselExtractor.call(File.read(ARGV[0])))' \ + files/van-gogh-paintings.html +``` + +### Output + +An array of symbol-keyed hashes, in the key order of `files/expected-array.json`. +The run command above pretty-prints for readability; the byte-for-byte claim is +about compact `to_json` — serialized that way, the Van Gogh case is identical to +the expected output: + +```json +{ "name": "The Starry Night", "extensions": ["1889"], "link": "https://www.google.com/search?...", "image": "data:image/jpeg;base64,..." } +``` + +`extensions` is omitted entirely when a tile has no secondary line (e.g. +yearless paintings). The only field we *know* the meaning of is the paintings' +date (from the expected fixture); for other types `extensions` carries +whatever the tile's second line is (year, or a character name for a cast +carousel). + +### Approach / design notes + +- **Locate by stable schema, not styling.** Tiles are found via the Knowledge + Graph `data-attrid` (e.g. `kc:/visual_art/visual_artist:works`), never by + minified classes, `jsname`, or per-request ids — those are not stable. +- **Allowlist of carousel tags, not "any tile container."** Scoping extraction to + a known carousel container keeps off-target tiles out. The Grateful Dead albums + page is the clearest case: alongside the 12 album tiles it carries eBay/Target + shopping thumbnails that *also* wrap an `<img>` in a `/search?q=…` anchor.
A + matcher keying only on "image+text tiles linking to /search" scrapes those two + in as extra entries (14 instead of 12); scoping to the + `kc:/music/artist:albums` container ignores them. Other off-target shapes the + allowlist skips: a non-entity strip (`kc:/common/topic:social media presence` + on the Unilever page) or an *entity* carousel of a type we haven't validated + (`kc:/business/business_operation:founder`, a company's founders). To support a + new type, add its tag to `CAROUSEL_ATTRIDS` plus a fixture and a spec. +- **Per-tile extraction depends on structure.** `name` and `extensions` come from + the leaf text `<div>`s under each anchor (name from the first div, falling back to + `img@alt`); `link` from the anchor. The split is positional — the first leaf is + the name and any leaf after it becomes an extension — so a tile with extra + decorative text would leak into `extensions`. Across the current fixtures each + tile has at most one secondary line, so this stays clean. +- **Thumbnails without extra requests.** The first tiles render a placeholder + `<img>` whose real bytes arrive later in the page: searching the HTML for the + base64 string from `expected-array.json` led to `_setImagesSrc(...)` `<script>`

Search Results

Breaking Bad
2008 ‧ Drama ‧ 5 seasons
Google apps
\ No newline at end of file diff --git a/spec/fixtures/frank_lloyd_wright_buildings.html b/spec/fixtures/frank_lloyd_wright_buildings.html new file mode 100644 index 00000000..e2421236 --- /dev/null +++ b/spec/fixtures/frank_lloyd_wright_buildings.html @@ -0,0 +1,57 @@ +frank lloyd wright buildings - Google Search

Search Results

Frank Lloyd Wright
American architect and designer
Google apps
\ No newline at end of file diff --git a/spec/fixtures/grateful_dead_albums.html b/spec/fixtures/grateful_dead_albums.html new file mode 100644 index 00000000..4868bb29 --- /dev/null +++ b/spec/fixtures/grateful_dead_albums.html @@ -0,0 +1,58 @@ +grateful dead albums - Google Search

Search Results

Grateful Dead
Rock band
Google apps
\ No newline at end of file diff --git a/spec/fixtures/mark_gonzales_skateboard_art.html b/spec/fixtures/mark_gonzales_skateboard_art.html new file mode 100644 index 00000000..1e05fdf0 --- /dev/null +++ b/spec/fixtures/mark_gonzales_skateboard_art.html @@ -0,0 +1,51 @@ +mark gonzales skateboard art - Google Search

Search Results

Sponsored products

Vision Mark Gonzales Modern Concave 10" Skateboard Deck, White
$89.95
The Dark Slide
The Dark Slide Vision Mark Gonzales Modern Concave 10" Skateboard Deck, Orange
$89.95
Shop app
Krooked Mark Gonz Gonzales Sweatpants 9.81" Old School Skateboard Deck
$159.99
eBay
The Dark Slide Vision Mark Gonzales "Original MG" LTD Two Tone 10" Skateboard Deck, Red/Yellow
$99.95
Shop app
Vision Mark Gonzales Modern Concave 10" Skateboard Deck, Orange
$89.95
The Dark Slide
4.5" Vision Mark Gonzales vinyl sticker. 80's Vintage style skateboard decal.
$2.95
Etsy
Google apps
Search Labs
Google Account
Jason Dinsmore
dinjas@gmail.com
\ No newline at end of file diff --git a/spec/fixtures/unilever_brands.html b/spec/fixtures/unilever_brands.html new file mode 100644 index 00000000..72c85bf8 --- /dev/null +++ b/spec/fixtures/unilever_brands.html @@ -0,0 +1,64 @@ +unilever brands - Google Search

Search Results

AI Overview
Unilever owns over 400 brands worldwide, categorized into Beauty & Wellbeing, Personal Care, Home Care, and Nutrition. Their flagship products include Dove, Hellmann's, Knorr, Axe, and Vaseline, which are household staples across North America and globally.
🧴 Beauty & Wellbeing
  • Dove & Dove Men+Care: Soaps, body washes, and hair care.
  • Paula’s Choice: Science-backed skincare and exfoliants.
  • Liquid I.V.: Hydration and wellness powders.
  • Nutrafol: Dermatologist-recommended hair growth supplements.
  • Other Brands: TRESemmé, Vaseline, Nexxus, Dermalogica, SheaMoisture, and Simple.
🧼 Personal Care
  • Our brands | Unilever
    Nutrafol. The No.1 dermatologist-recommended hair growth supplement brand in the US. OMO. Dirt Is Good. Paula's Choice. Beauty beg...
    Unilever
  • Brands | Unilever
    Domestos. Unstoppable. Dove. Change beauty into a positive experience for every woman and the next generation. Hellmann's. Knorr. ...
    Unilever
  • List of Unilever brands - Wikipedia
    Condiments and extracts. Amino – food products (Poland) Amora – French mayonnaise and dressings (France, Belgium and Morocco) Arom...
    Wikipedia
Show all
Show more
Google apps
\ No newline at end of file diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb new file mode 100644 index 00000000..32dbfad7 --- /dev/null +++ b/spec/lib/carousel_extractor_spec.rb @@ -0,0 +1,230 @@ +# frozen_string_literal: true + +RSpec.describe CarouselExtractor do + let(:artworks) { described_class.call(fixture_input) } + let(:fixture_input) { File.read("#{FILES}/van-gogh-paintings.html") } + let(:fixture_expected) { File.read("#{FILES}/expected-array.json") } + let(:expected) { JSON.parse(fixture_expected).fetch("artworks") } + + def entry_looks_valid?(entry) + entry[:name].to_s != "" && + entry[:link].to_s.start_with?("https://www.google.com/search") && + entry[:image].to_s.start_with?("data:image") + end + + describe "van-gogh-paintings.html (challenge fixture)" do + it "reproduces the expected artworks array exactly" do + expect(artworks.to_json).to eql(expected.to_json) + end + + describe "first artwork" do + subject(:first) { artworks.first } + + let(:starry_night) { expected.first } + + it("has a name") { expect(first[:name]).to eql(starry_night["name"]) } + it("has a link") { expect(first[:link]).to eql(starry_night["link"]) } + it("has extensions") do + expect(first[:extensions]).to eql(starry_night["extensions"]) + end + it("has an inline base64 image") do + expect(first[:image]).to eql(starry_night["image"]) + end + end + + it "omits extensions for yearless paintings rather than emitting []" do + yearless = artworks.reject { |a| a.key?(:extensions) } + expect(yearless.map { |a| a[:name] }).to include("Sunflowers") + expect(yearless).to all(satisfy { |a| !a.key?(:extensions) }) + end + + it "needs no extra HTTP requests (every image is inline data: or an in-page URL)" do + expect(artworks).to all(satisfy { |a| + a[:image].start_with?("data:image", "https://") + }) + end + end + + # Mirrors the per-artwork assertions from SerpApi's referenced Monet spec + # (which also hits the live API and covers the whole knowledge 
graph). + describe "conforms to SerpApi's referenced artwork contract" do + it "returns a non-empty artworks Array" do + expect(artworks).to be_an(Array) + expect(artworks).to_not be_empty + end + + it "first artwork has name/extensions/link/image of the expected types" do + first = artworks.first + expect(first[:name]).to be_a(String) + expect(first[:name]).to_not be_empty + expect(first[:extensions]).to be_a(Array) + expect(first[:extensions]).to_not be_empty + expect(first[:link]).to be_a(String) + expect(first[:link]).to_not be_empty + expect(first[:image]).to be_a(String) + expect(first[:image]).to_not be_empty + end + end + + describe "alternate carousel type: Grateful Dead albums" do + let(:albums) do + described_class.call(File.read("#{FIXTURES}/grateful_dead_albums.html")) + end + + it "extracts the albums carousel via the music attrid" do + expect(albums.size).to eql(12) + end + + it "fills name (from the text div) + link + image for every album" do + expect(albums).to all(satisfy(&method(:entry_looks_valid?))) + end + + it "captures release years as extensions" do + blues = albums.find { |a| a[:name] == "Blues for Allah" } + expect(blues[:extensions]).to eql(["1975"]) + end + end + + describe "alternate carousel type: Frank Lloyd Wright buildings (no dates)" do + let(:buildings) { described_class.call(building_fixture) } + let(:building_fixture) do + File.read("#{FIXTURES}/frank_lloyd_wright_buildings.html") + end + + it "extracts the buildings carousel via the architect attrid" do + expect(buildings.size).to eql(12) + end + + it "omits extensions across the entire carousel" do + expect(buildings).to all(satisfy { |a| !a.key?(:extensions) }) + end + + it "still fills name + link + base64 image for every building" do + expect(buildings).to all(satisfy(&method(:entry_looks_valid?))) + end + end + + describe "alternate carousel type: Breaking Bad cast" do + let(:cast) { described_class.call(cast_fixture) } + let(:cast_fixture) { 
File.read("#{FIXTURES}/breaking_bad_cast.html") } + + it "extracts the cast carousel via the tv_program attrid" do + expect(cast.size).to eql(8) + end + + it "puts the actor in name and the character in extensions" do + cranston = cast.find { |a| a[:name] == "Bryan Cranston" } + expect(cranston[:extensions]).to eql(["Walter White"]) + end + + it "fills name + link + base64 image for every cast member" do + expect(cast).to all(satisfy(&method(:entry_looks_valid?))) + end + end + + describe "non-carousel / wrong-module pages (negatives: locator must not false-positive)" do + it "returns [] for an organic/ads SERP (Mark Gonzales)" do + html = File.read("#{FIXTURES}/mark_gonzales_skateboard_art.html") + expect(described_class.call(html)).to eql([]) + end + + # Unilever's only carousel-shaped module ("social media presence") isn't an entity collection. + it "returns [] when the only carousel-shaped module is not an entity collection (Unilever)" do + html = File.read("#{FIXTURES}/unilever_brands.html") + expect(described_class.call(html)).to eql([]) + end + end + + describe "per-tile guards" do + it "drops anchors that lack an image or an href, keeping only real tiles" do + html = <<~HTML +
+ + Real Album
Real Album
+
+ More results + No href +
+ HTML + + expect(described_class.call(html).map { |e| e[:name] }) + .to eql(["Real Album"]) + end + + it "skips the data:image/gif placeholder in favor of the in-page data-src url" do + html = <<~HTML +
+ + Lazy Tile +
Lazy Tile
+
+
+ HTML + + entry = described_class.call(html).first + expect(entry[:image]) + .to eql("https://encrypted-tbn0.gstatic.com/images?q=tbn:lazy") + end + + it "handles an img with no src attribute, falling back to data-src" do + html = <<~HTML +
+ + Srcless Tile +
Srcless Tile
+
+
+ HTML + + entry = described_class.call(html).first + expect(entry[:image]) + .to eql("https://encrypted-tbn0.gstatic.com/images?q=tbn:srcless") + end + end + + describe "page with more than one allowlisted carousel" do + # A polymath like Leonardo da Vinci matches several allowlisted attrids at + # once (architect + visual artist). When more than one is genuinely + # populated we return every carousel's tiles concatenated, in + # CAROUSEL_ATTRIDS order, rather than guessing which single one was wanted. + it "returns every populated carousel's tiles, in CAROUSEL_ATTRIDS order" do + html = <<~HTML +
+ + A building
A building
+
+
+
+ + Mona Lisa
Mona Lisa
+
+ + The Last Supper
The Last Supper
+
+
+ HTML + + expect(described_class.call(html).map { |e| e[:name] }) + .to eql(["A building", "Mona Lisa", "The Last Supper"]) + end + + # The architecture block here holds no image tiles (da Vinci's real shape), + # so it contributes nothing and only the paintings come back. + it "ignores a matched carousel that holds no image tiles" do + html = <<~HTML +
+ A building +
+
+ + Mona Lisa
Mona Lisa
+
+
+ HTML + + expect(described_class.call(html).map { |e| e[:name] }).to eql(["Mona Lisa"]) + end + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 00000000..65cde27a --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +require "json" +require_relative "../lib/carousel_extractor" + +FILES = File.expand_path("../files", __dir__) +FIXTURES = File.expand_path("fixtures", __dir__)