diff --git a/404.html b/404.html deleted file mode 100644 index 086a5c9..0000000 --- a/404.html +++ /dev/null @@ -1,25 +0,0 @@ ---- -permalink: /404.html -layout: default ---- - - - -
-

404

- -

Page not found :(

-

The requested page could not be found.

-
diff --git a/Gemfile b/Gemfile deleted file mode 100644 index f11e16b..0000000 --- a/Gemfile +++ /dev/null @@ -1,35 +0,0 @@ -source "https://rubygems.org" -# Hello! This is where you manage which Jekyll version is used to run. -# When you want to use a different version, change it below, save the -# file and run `bundle install`. Run Jekyll with `bundle exec`, like so: -# -# bundle exec jekyll serve -# -# This will help ensure the proper Jekyll version is running. -# Happy Jekylling! -# gem "jekyll", "~> 4.3.2" -# This is the default theme for new Jekyll sites. You may change this to anything you like. -# gem "hacker", "~> 0.2.0" -# If you want to use GitHub Pages, remove the "gem "jekyll"" above and -# uncomment the line below. To upgrade, run `bundle update github-pages`. -# If you have any plugins, put them here! -group :jekyll_plugins do - gem "jekyll-feed", "~> 0.12" - gem "github-pages", "~> 228" -end - -# Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem -# and associated library. -platforms :mingw, :x64_mingw, :mswin, :jruby do - gem "tzinfo", ">= 1", "< 3" - gem "tzinfo-data" -end - -# Performance-booster for watching directories on Windows -gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin] - -# Lock `http_parser.rb` gem to `v0.6.x` on JRuby builds since newer versions of the gem -# do not have a Java counterpart. -gem "http_parser.rb", "~> 0.6.0", :platforms => [:jruby] - -gem "webrick", "~> 1.8" diff --git a/Gemfile.lock b/Gemfile.lock deleted file mode 100644 index d7b1808..0000000 --- a/Gemfile.lock +++ /dev/null @@ -1,265 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - activesupport (7.0.5) - concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 1.6, < 2) - minitest (>= 5.1) - tzinfo (~> 2.0) - addressable (2.8.4) - public_suffix (>= 2.0.2, < 6.0) - coffee-script (2.4.1) - coffee-script-source - execjs - coffee-script-source (1.11.1) - colorator (1.1.0) - commonmarker (0.23.9) - concurrent-ruby (1.2.2) - dnsruby (1.70.0) - simpleidn (~> 0.2.1) - em-websocket (0.5.3) - eventmachine (>= 0.12.9) - http_parser.rb (~> 0) - ethon (0.16.0) - ffi (>= 1.15.0) - eventmachine (1.2.7) - execjs (2.8.1) - faraday (2.7.7) - faraday-net_http (>= 2.0, < 3.1) - ruby2_keywords (>= 0.0.4) - faraday-net_http (3.0.2) - ffi (1.15.5) - forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (228) - github-pages-health-check (= 1.17.9) - jekyll (= 3.9.3) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.4.0) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) - jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) - jekyll-include-cache (= 0.2.1) - jekyll-mentions (= 1.6.0) - jekyll-optional-front-matter (= 0.3.2) - jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.16.0) - jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.3) - jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.8.0) - jekyll-sitemap (= 1.4.0) - jekyll-swiss (= 1.0.0) - jekyll-theme-architect (= 0.2.0) - jekyll-theme-cayman (= 0.2.0) - jekyll-theme-dinky (= 0.2.0) - jekyll-theme-hacker (= 0.2.0) - jekyll-theme-leap-day (= 0.2.0) - jekyll-theme-merlot (= 0.2.0) - jekyll-theme-midnight (= 0.2.0) - jekyll-theme-minimal (= 0.2.0) - jekyll-theme-modernist (= 0.2.0) - jekyll-theme-primer (= 0.6.0) - jekyll-theme-slate (= 0.2.0) - jekyll-theme-tactile (= 0.2.0) - jekyll-theme-time-machine (= 0.2.0) - jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.2) - 
kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.4) - mercenary (~> 0.3) - minima (= 2.5.1) - nokogiri (>= 1.13.6, < 2.0) - rouge (= 3.26.0) - terminal-table (~> 1.4) - github-pages-health-check (1.17.9) - addressable (~> 2.3) - dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 3.0, < 5.0) - typhoeus (~> 1.3) - html-pipeline (2.14.3) - activesupport (>= 2) - nokogiri (>= 1.4) - http_parser.rb (0.8.0) - i18n (1.14.1) - concurrent-ruby (~> 1.0) - jekyll (3.9.3) - addressable (~> 2.4) - colorator (~> 1.0) - em-websocket (~> 0.5) - i18n (>= 0.7, < 2) - jekyll-sass-converter (~> 1.0) - jekyll-watch (~> 2.0) - kramdown (>= 1.17, < 3) - liquid (~> 4.0) - mercenary (~> 0.3.3) - pathutil (~> 0.9) - rouge (>= 1.7, < 4) - safe_yaml (~> 1.0) - jekyll-avatar (0.7.0) - jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) - coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.4.0) - commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.4.0) - commonmarker (~> 0.23.7) - jekyll (~> 3.9.0) - jekyll-commonmark (~> 1.4.0) - rouge (>= 2.0, < 5.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) - jekyll (>= 3.7, < 5.0) - jekyll-gist (1.5.0) - octokit (~> 4.2) - jekyll-github-metadata (2.13.0) - jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) - jekyll-include-cache (0.2.1) - jekyll (>= 3.7, < 5.0) - jekyll-mentions (1.6.0) - html-pipeline (~> 2.3) - jekyll (>= 3.7, < 5.0) - jekyll-optional-front-matter (0.3.2) - jekyll (>= 3.0, < 5.0) - jekyll-paginate (1.1.0) - jekyll-readme-index (0.3.0) - jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.16.0) - jekyll (>= 3.3, < 5.0) - jekyll-relative-links (0.6.1) - jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.3) - addressable (~> 2.0) - jekyll (>= 3.5, < 5.0) - jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) - rubyzip (>= 1.3.0, < 3.0) - jekyll-sass-converter (1.5.2) - sass (~> 3.4) - jekyll-seo-tag (2.8.0) - jekyll (>= 3.8, < 5.0) - jekyll-sitemap (1.4.0) - jekyll (>= 3.7, < 5.0) - jekyll-swiss (1.0.0) - jekyll-theme-architect (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.6.0) - jekyll (> 3.5, < 5.0) - jekyll-github-metadata (~> 2.9) - jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.3) - jekyll (>= 3.3, < 5.0) - jekyll-watch (2.2.1) - listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) - html-pipeline (~> 2.2) - jekyll (>= 3.0, < 5.0) - kramdown (2.3.2) - rexml - kramdown-parser-gfm (1.1.0) - kramdown (~> 2.0) - liquid (4.0.4) - listen (3.8.0) - rb-fsevent (~> 0.10, >= 0.10.3) - rb-inotify (~> 0.9, >= 0.9.10) - mercenary (0.3.6) - minima (2.5.1) - jekyll (>= 3.5, < 5.0) - 
jekyll-feed (~> 0.9) - jekyll-seo-tag (~> 2.1) - minitest (5.18.1) - nokogiri (1.15.2-arm64-darwin) - racc (~> 1.4) - octokit (4.25.1) - faraday (>= 1, < 3) - sawyer (~> 0.9) - pathutil (0.16.2) - forwardable-extended (~> 2.6) - public_suffix (4.0.7) - racc (1.7.1) - rb-fsevent (0.11.2) - rb-inotify (0.10.1) - ffi (~> 1.0) - rexml (3.2.5) - rouge (3.26.0) - ruby2_keywords (0.0.5) - rubyzip (2.3.2) - safe_yaml (1.0.5) - sass (3.7.4) - sass-listen (~> 4.0.0) - sass-listen (4.0.0) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.9.2) - addressable (>= 2.3.5) - faraday (>= 0.17.3, < 3) - simpleidn (0.2.1) - unf (~> 0.1.4) - terminal-table (1.8.0) - unicode-display_width (~> 1.1, >= 1.1.1) - typhoeus (1.4.0) - ethon (>= 0.9.0) - tzinfo (2.0.6) - concurrent-ruby (~> 1.0) - unf (0.1.4) - unf_ext - unf_ext (0.0.8.2) - unicode-display_width (1.8.0) - webrick (1.8.1) - -PLATFORMS - arm64-darwin-22 - -DEPENDENCIES - github-pages (~> 228) - http_parser.rb (~> 0.6.0) - jekyll-feed (~> 0.12) - tzinfo (>= 1, < 3) - tzinfo-data - wdm (~> 0.1.1) - webrick (~> 1.8) - -BUNDLED WITH - 2.4.14 diff --git a/LICENSE b/LICENSE deleted file mode 100644 index b3d95a8..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2023 avimallu - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index 2a62b0a..0000000 --- a/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# What is this repo? - -This is the source code for my personal website. You can visit it [here](https://avimallu.github.io/). \ No newline at end of file diff --git a/_config.yml b/_config.yml deleted file mode 100644 index b60e640..0000000 --- a/_config.yml +++ /dev/null @@ -1,56 +0,0 @@ -# Welcome to Jekyll! -# -# This config file is meant for settings that affect your whole blog, values -# which you are expected to set up once and rarely edit after that. If you find -# yourself editing this file very often, consider using Jekyll's data files -# feature for the data you need to update frequently. -# -# For technical reasons, this file is *NOT* reloaded automatically when you use -# 'bundle exec jekyll serve'. If you change this file, please restart the server process. -# -# If you need help with YAML syntax, here are some quick references for you: -# https://learn-the-web.algonquindesign.ca/topics/markdown-yaml-cheat-sheet/#yaml -# https://learnxinyminutes.com/docs/yaml/ -# -# Site settings -# These are used to personalize your new site. 
If you look in the HTML files, -# you will see them accessed via {{ site.title }}, {{ site.email }}, and so on. -# You can create any custom variable you would like, and they will be accessible -# in the templates via {{ site.myvariable }}. - -title: Avinash's Blog -email: avimallu@avimallu -show_downloads: False -description: >- # this means to ignore newlines until "baseurl:" - Avinash's personal blog. -baseurl: "" # the subpath of your site, e.g. /blog -url: "" # the base hostname & protocol for your site, e.g. http://example.com -domain: avimallu.github.io -url: https://avimallu.github.io -linkedin_username: avinash-mallya -github_username: avimallu - -# Build settings -remote_theme: pages-themes/hacker@v0.2.0 -plugins: -- jekyll-remote-theme - -# Exclude from processing. -# The following items will not be processed, by default. -# Any item listed under the `exclude:` key here will be automatically added to -# the internal "default list". -# -# Excluded items can be processed by explicitly listing the directories or -# their entries' file path in the `include:` list. -# -# exclude: -# - .sass-cache/ -# - .jekyll-cache/ -# - gemfiles/ -# - Gemfile -# - Gemfile.lock -# - node_modules/ -# - vendor/bundle/ -# - vendor/cache/ -# - vendor/gems/ -# - vendor/ruby/ diff --git a/_includes/head-custom.html b/_includes/head-custom.html deleted file mode 100644 index 44cdb83..0000000 --- a/_includes/head-custom.html +++ /dev/null @@ -1,12 +0,0 @@ - - - -{% include head-custom-theme-colors.html %} - - -{% include head-custom-google-analytics.html %} - - - - - diff --git a/about.md b/about.md deleted file mode 100644 index e2ade15..0000000 --- a/about.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -title: About Me -permalink: /about_me ---- - -Hi there! - -My name is Avinash Mallya (pronounced Uh-vin-ash Mul-yeah), and I'm a Data Scientist at [WISEcode](https://www.wisecode.ai/). This is my personal blog where I post about some creative ways that I've solved some complex problems in my career, usually with a solid amount of code to make sure that it's helpful for you. - -In my free time, I'm involved in helping folks out at my favourite open-source package repositories, namely [Polars](https://github.com/pola-rs/polars/) and [`data.table`](https://github.com/Rdatatable/data.table). In fact - I've written [some parts](https://pola-rs.github.io/polars-book/user-guide/) of the Polars user-guide. - -You can connect with me on [LinkedIn](https://www.linkedin.com/in/avinash-mallya), or [Github](https://github.com/avimallu). The source code for this website can be found at its [repo on my Github](https://github.com/avimallu/avimallu.github.io) as well. You'll also find source code in the form of text files, Jupyter Notebooks, or R Markdown files on my Github profile. 
\ No newline at end of file diff --git a/archetypes/default.md b/archetypes/default.md new file mode 100644 index 0000000..25b6752 --- /dev/null +++ b/archetypes/default.md @@ -0,0 +1,5 @@ ++++ +date = '{{ .Date }}' +draft = true +title = '{{ replace .File.ContentBaseName "-" " " | title }}' ++++ diff --git a/assets/001_overlap_joins/overlap_algorithm.xlsx b/assets/001_overlap_joins/overlap_algorithm.xlsx deleted file mode 100644 index a7a79ce..0000000 Binary files a/assets/001_overlap_joins/overlap_algorithm.xlsx and /dev/null differ diff --git a/assets/favicon.png b/assets/favicon.png deleted file mode 100644 index 1de88d5..0000000 Binary files a/assets/favicon.png and /dev/null differ diff --git a/assets/original.css b/assets/original.css new file mode 100644 index 0000000..5c27865 --- /dev/null +++ b/assets/original.css @@ -0,0 +1,187 @@ +code { + text-size-adjust: 100%; + -ms-text-size-adjust: 100%; + -moz-text-size-adjust: 100%; + -webkit-text-size-adjust: 100%; +} + +body { + font-family: Verdana, sans-serif; + margin: auto; + padding: 20px; + max-width: 720px; + text-align: left; + background-color: #1d1f27; + word-wrap: break-word; + overflow-wrap: break-word; + line-height: 1.5; + color: #c9d1d9; +} + +h1, +h2, +h3, +h4, +h5, +h6, +strong, +b { + color: #eee; +} + +a { + color: #8cc2dd; +} + +.title { + text-decoration: none; + border: 0; +} +.title h1 { + font-size: 24px; + margin: 19.92px 0 19.92px 0; +} + +.title span { + font-weight: 400; +} + +nav a { + margin-right: 10px; +} + +textarea { + background-color: #252525; + color: #ddd; + width: 100%; + font-size: 16px; +} + +input { + background-color: #252525; + color: #ddd; + font-size: 16px; +} + +content { + line-height: 1.6; +} + +table { + width: 100%; +} + +table, +th, +td { + border: 1px solid; + border-collapse: collapse; + border-color: #c9d1d9; + padding: 5px; +} + +img { + max-width: 100%; + height: auto; +} + +code { + padding: 2px 5px; + color: #f8f8f2; + background-color: #282a36; +} + +pre code { + display: block; + padding: 20px; + white-space: pre-wrap; + font-size: 14px; + overflow-x: auto; + text-wrap: nowrap; +} + +blockquote { + border-left: 1px solid #999; + color: #ccc; + padding-left: 20px; + font-style: italic; +} + +footer { + padding: 25px; + text-align: center; +} + +.helptext { + color: #aaa; + font-size: small; +} + +.errorlist { + color: #eba613; + font-size: small; +} + +/* blog posts */ +ul.blog-posts { + list-style-type: none; + padding: unset; +} + +ul.blog-posts li { + display: flex; + margin-bottom: 10px; +} + +ul.blog-posts li span { + flex: 0 0 130px; +} + +ul.blog-posts li a:visited { + color: #8b6fcb; +} + +a.blog-tags { + line-height: 2; + margin-right: 12px; +} + +h3.blog-filter { + margin-bottom: 0; +} + +.disabled { + color: currentColor; + cursor: not-allowed; + opacity: 0.7; +} + +p.byline { + font-style: italic; +} + +/* "Skip to main content" link */ +.skip-link { + position: absolute; + top: 5; + transform: translateY(-600%); + transition: transform 0.5s; + background-color: #1d1f27; + padding: 6px; +} + +.skip-link:focus { + transform: translateY(0%); +} + +figure { + margin-inline-start: 0em; + margin-inline-end: 0em; +} + +figcaption > p { + margin-block-start: 0px; + text-align: center; + font-style: italic; + color: #ccc; +} diff --git a/build_website.sh b/build_website.sh deleted file mode 100755 index 414c0c7..0000000 --- a/build_website.sh +++ /dev/null @@ -1 +0,0 @@ -bundle exec jekyll serve diff --git a/content/_index.md b/content/_index.md new file mode 100644 index 
0000000..0f68743 --- /dev/null +++ b/content/_index.md @@ -0,0 +1,26 @@ +--- +title: "about" +menu: "main" +weight: 1 +--- + +# Hi there! + +My name is Avinash Mallya (pronounced Uh-vin-aash Muh-ll-yeah), and I'm a data scientist by profession. This website is a creative outlet, and my piece of the internet where I show off. + +# What's here? + +You'll find the following: + +* A few posts where I show up some creative ways that I've solved complex problems. +* Links to projects that I've worked on, or have contributed to. +* An assortment of random things I've found interesting. + +# Contact + +You can find me on: + +* [LinkedIn](https://www.linkedin.com/in/avinash-mallya) +* [Github](https://github.com/avimallu) + +Please reach out via one of the above if you want to talk. diff --git a/content/blog/001_overlap_joins/index.md b/content/blog/001_overlap_joins/index.md new file mode 100644 index 0000000..2cd619a --- /dev/null +++ b/content/blog/001_overlap_joins/index.md @@ -0,0 +1,495 @@ ++++ +date = '2023-06-22' +draft = false +title = 'Overlap Joins: Number of docker trucks in an interval' ++++ + +# Premise + +I stumbled upon an interesting [Stackoverflow question](https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period) that was linked [via an issue](https://github.com/pola-rs/polars/issues/9467) on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome. + +I'm more of a right-tool-for-the-job person, so I tried to find a better solution. + +# Problem Statement + +Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck's ID. + +```py +import polars as pl # if you don't have polars, run + # pip install 'polars[all]' +data = pl.from_repr(""" +┌─────────────────────┬─────────────────────┬─────┐ +│ arrival_time ┆ departure_time ┆ ID │ +│ --- ┆ --- ┆ --- │ +│ datetime[μs] ┆ datetime[μs] ┆ str │ +╞═════════════════════╪═════════════════════╪═════╡ +│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1 │ +│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1 │ +│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5 │ +│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │ +│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3 │ +│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3 │ +│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6 │ +│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5 │ +│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6 │ +└─────────────────────┴─────────────────────┴─────┘ +""") +``` + +We want to identify the number of trucks docked at any given time within a threshold of 1 minute *prior* to the arrival time of a truck, and 1 minute *after* the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data. 
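To make that concrete, here is a minimal sketch (not part of the original post) of how the per-row window could be materialized in Polars; the `window_open`/`window_close` names are just illustrative:

```py
# A small sketch, assuming the `data` DataFrame defined above:
# pad each truck's visit by one minute on either side to get the
# per-row overlap window used in the rest of the post.
windows = data.with_columns(
    (pl.col("arrival_time") - pl.duration(minutes=1)).alias("window_open"),
    (pl.col("departure_time") + pl.duration(minutes=1)).alias("window_close"),
)
print(windows)
```

Counting the trucks whose visits overlap each of these windows is the part that actually needs a join, which is what the rest of the post works through.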
+ +# Finding a solution to the problem + +## Evaluate for a specific row + +Before we find a general solution to this problem, let's consider a specific row to understand the problem better: + +```py +""" +┌─────────────────────┬─────────────────────┬─────┐ +│ arrival_time ┆ departure_time ┆ ID │ +│ --- ┆ --- ┆ --- │ +│ datetime[μs] ┆ datetime[μs] ┆ str │ +╞═════════════════════╪═════════════════════╪═════╡ +│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │ +└─────────────────────┴─────────────────────┴─────┘ +""" +``` + +For this row, we need to find the number of trucks that are there between `2023-01-01 06:31:06` (1 minute prior to the `arrival_time` and `2023-01-01 06:34:48` (1 minute post the `departure_time`). Manually going through the original dataset, we see that `B3`, `C3`, `A6` and `A5` are the truck IDs that qualify - they all are at the station in a duration that is between `2023-01-01 06:31:06` and `2023-01-01 06:34:48`. + +## Visually deriving an algorithm + +There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap *window* relative to the arrival and departure times): + +![The five different ways a period can overlap.](overlap_algorithm.png) + +Take some time to absorb these cases - it's important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements. + +## Writing an SQL query based on the algorithm + +In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It's often able to convey elegantly the logic that was used to execute the algorithm; and while it does come with excessive verbosity at times, it doesn't quite in this case. + +Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!). + +### Introducing the DuckDB package + +Once again, in theory, any SQL package or language can be used. Far too few however meet the ease-of-use that [DuckDB](https://duckdb.org/) provides: + +1. no expensive set-up time (meaning no need for setting up databases, even temporary ones), +2. no dependencies (other than DuckDB itself, just `pip install duckdb`), +3. some very [friendly SQL extensions](https://duckdb.org/2022/05/04/friendlier-sql.html), and +4. ability to work directly on Polars and Pandas DataFrames without conversions + +all with [mind-blowing speed](https://duckdblabs.github.io/db-benchmark/) that stands shoulder-to-shoulder with Polars. We'll also use a few advanced SQL concepts noted below. + +#### Self-joins + +This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them. + +#### A bullet train recap of non-equi joins + +A key concept that we'll use is the idea of joining on a *range* of values rather than a specific value. That is, instead of the usual `LEFT JOIN ON A.column = B.column`, we can do `LEFT JOIN ON A.column <= B.column` for one row in table `A` to match to multiple rows in `B`. 
DuckDB has a [blog post](https://duckdb.org/2022/05/27/iejoin.html) that outlines this join in detail, including a fast implementation. + +#### The concept of `LIST` columns + +DuckDB has first-class support for `LIST` columns - that is, each row in a `LIST` column can have a varying length (much like a Python `list`), but must have the exact same datatype (like R's `vector`). Using list columns allows us to eschew the use of an additional `GROUP BY` operation on top of a `WHERE` filter or `SELECT DISTINCT` operation, since we can directly perform those on the `LIST` column itself. + +#### Date algebra + +Dates can be rather difficult to handle well in most tools and languages, with several packages purpose-built to make handling them easier - [lubridate](https://lubridate.tidyverse.org/) from the [tidyverse](https://www.tidyverse.org/) is a stellar example. Thankfully, DuckDB provides a similar Swiss-Army-knife set of tools to deal with dates, including specifying `INTERVAL`s (a special data type that represents a period of time independent of specific time values) to modify `TIMESTAMP` values using addition or subtraction. + +### Tell me the query, PLEASE! + +Okay - that was a lot of background. Let's have at it! The query by itself in SQL is (see immediately below for runnable code in Python): + +```sql +SELECT + A.arrival_time + ,A.departure_time + ,A.window_open + ,A.window_close + ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks + ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count + +FROM ( + SELECT * + ,arrival_time - (INTERVAL 1 MINUTE) AS window_open + ,departure_time + (INTERVAL 1 MINUTE) AS window_close + FROM data) A + +LEFT JOIN ( + SELECT * + ,DATEDIFF('seconds', arrival_time, departure_time) AS duration + FROM data) B + +ON ((B.arrival_time <= A.window_open AND + (B.arrival_time + TO_SECONDS(B.duration)) >= A.window_open) OR + (B.arrival_time >= A.window_open AND + B.departure_time <= A.window_close) OR + (B.arrival_time >= A.window_open AND + (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)) +GROUP BY 1, 2, 3, 4 +``` + +A small, succinct query such as this will need a bit of explanation to take it all in. Here's one below, reproducible in Python (make sure to install `duckdb` first!). Expand it to view. + +<details>
SQL with explanation. + +```py +import duckdb as db +db.query(""" + SELECT + A.arrival_time + ,A.departure_time + ,A.window_open + ,A.window_close + -- LIST aggregates the values into a LIST column + -- and LIST_DISTINCT finds the unique values in it + ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks + -- finally, LIST_UNIQUE calculates the unique number of values in it + ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count + + FROM ( + SELECT + * + ,arrival_time - (INTERVAL 1 MINUTE) AS window_open + ,departure_time + (INTERVAL 1 MINUTE) AS window_close + FROM data -- remember we defined data as the Polars DataFrame with our truck station data + ) A + + LEFT JOIN ( + SELECT + * + -- This is the time, in seconds between the arrival and departure of + -- each truck PER ROW in the original data-frame + ,DATEDIFF('seconds', arrival_time, departure_time) AS duration + FROM data -- this is where we perform a self-join + ) B + + ON ( + -- Case 2 in the diagram; + (B.arrival_time <= A.window_open AND + -- Adding the duration here makes sure that the second interval + -- is at least ENDING AFTER the start of the overlap window + (B.arrival_time + TO_SECONDS(B.duration)) >= A.window_open) OR + + -- Case 3 in the diagram - the simplest of all five cases + (B.arrival_time >= A.window_open AND + B.departure_time <= A.window_close) OR + + -- Case 4 in the digram; + (B.arrival_time >= A.window_open AND + -- Subtracting the duration here makes sure that the second interval + -- STARTS BEFORE the end of the overlap window. + (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close) + ) + GROUP BY 1, 2, 3, 4 +""") +``` + +
+ +The output of this query is: + +``` +""" +┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐ +│ arrival_time │ departure_time │ window_open │ … │ docked_trucks │ docked_truck_count │ +│ timestamp │ timestamp │ timestamp │ │ varchar[] │ uint64 │ +├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤ +│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1] │ 1 │ +│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1] │ 1 │ +│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │ 4 │ +│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │ 4 │ +│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │ 4 │ +│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │ 4 │ +│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │ 4 │ +│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3] │ 3 │ +│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3] │ 3 │ +├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤ +│ 9 rows 6 columns (5 shown) │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +""" +``` + +We clearly see the strengths of DuckDB in how succintly we were able to express this operation. We also find how DuckDB is able to seamlessly integrate with an existing Pandas or Polars pipeline with zero-conversion costs. In fact, we can convert this back to a Polars or Pandas dataframe by appending the ending bracket with `db.query(...).pl()` and `db.query(...).pd()` respectively. + +## Can we make the SQL simpler? + +Now that we've understood the logic that goes into the query, let's try to optimize the algorithm. We have the three conditions: + +```sql +-- Case 2 in the diagram +(B.arrival_time <= A.window_open AND + (B.arrival_time + TO_SECONDS(B.duration)) >= A.window_open) OR +-- Case 3 in the diagram +(B.arrival_time >= A.window_open AND + B.departure_time <= A.window_close) OR +-- Case 4 in the diagram +(B.arrival_time >= A.window_open AND + (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close) +``` + +What is common between these three conditions? It takes a while to see it; but it becomes clear that all these cases require the start of the overlap to be *before* the window ends, and the end of the overlap to be *after* the window starts. This can be simplified to just: + +```sql +B.arrival_time <= A.window_close AND +B.departure_time >= A.window_open +``` + +making our query much simpler! + +### Simplified SQL: Part 1 + +We've removed the need for the `duration` calculation algother now. Therefore, we can write: + +```sql +SELECT + A.arrival_time + ,A.departure_time + ,A.window_open + ,A.window_close + ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks + ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count + +FROM ( + SELECT * + ,arrival_time - (INTERVAL 1 MINUTE) AS window_open + ,departure_time + (INTERVAL 1 MINUTE) AS window_close + FROM data) A + +LEFT JOIN data B + +ON ( + B.arrival_time <= A.window_close AND + B.departure_time >= A.window_open +) +GROUP BY 1, 2, 3, 4 +``` + +Can we simplify this even further? 
+ +### Simplification: Part 2 + +I think the SQL query in the above section is very easy to read already. However, it is a little clunky overall, and there is a way that we can leverage DuckDB's extensive optimizations to improve **legibility** by rewriting the query as a cross join: + +```sql +SELECT + A.arrival_time + ,A.departure_time + ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open + ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close + ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks + ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count +FROM data A, data B +WHERE B.arrival_time <= window_close +AND B.departure_time >= window_open +GROUP BY 1, 2, 3, 4 +``` + +Why does this work? Before optimization on DuckDB, this is what the query plan looks like: + +<details>
DuckDB query plan before optimization + +```py +""" +┌───────────────────────────┐ +│ PROJECTION │ +│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ docked_trucks │ +│ docked_truck_count │ +└─────────────┬─────────────┘ +┌─────────────┴─────────────┐ +│ AGGREGATE │ +│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ +│ arrival_time │ +│ departure_time │ +│ window_open │ +│ window_close │ +│ list(ID) │ +└─────────────┬─────────────┘ +┌─────────────┴─────────────┐ +│ FILTER │ +│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ +│ (arrival_time <= │ +│(departure_time + to_m... │ +│ AS BIGINT)))) │ +│ (departure_time >= │ +│(arrival_time - to_min... │ +│ AS BIGINT)))) │ +└─────────────┬─────────────┘ +┌─────────────┴─────────────┐ +│ CROSS_PRODUCT ├──────────────┐ +└─────────────┬─────────────┘ │ +┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +│ ARROW_SCAN ││ ARROW_SCAN │ +└───────────────────────────┘└───────────────────────────┘ +""" +``` + +
+ +After optimization, the `CROSS_PRODUCT` is **automatically** optimized to an **interval join**! + +
DuckDB query after before optimization + +```py +""" +┌───────────────────────────┐ +│ PROJECTION │ +│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ docked_trucks │ +│ docked_truck_count │ +└─────────────┬─────────────┘ +┌─────────────┴─────────────┐ +│ AGGREGATE │ +│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ +│ arrival_time │ +│ departure_time │ +│ window_open │ +│ window_close │ +│ list(ID) │ +└─────────────┬─────────────┘ +┌─────────────┴─────────────┐ +│ COMPARISON_JOIN │ +│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ +│ INNER │ +│ ((departure_time + '00:01 │ +│ :00'::INTERVAL) >= ├──────────────┐ +│ arrival_time) │ │ +│((arrival_time - '00:01:00'│ │ +│ ::INTERVAL) <= │ │ +│ departure_time) │ │ +└─────────────┬─────────────┘ │ +┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +│ ARROW_SCAN ││ ARROW_SCAN │ +└───────────────────────────┘└───────────────────────────┘ +""" +``` + +
+ +So in effect, we're actually exploiting a feature of DuckDB to allow us to write our queries in a suboptimal manner for greater readability, and allowing the optimizer to do a good chunk of our work for us. I wouldn't recommend using this generally, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets. + +### How to get query plans? + +I'm glad you asked. Here's the DuckDB [page explaining `EXPLAIN`](https://duckdb.org/docs/guides/meta/explain.html) (heh). Here's the code I used: + +```py +import duckdb as db +db.sql("SET EXPLAIN_OUTPUT='all';") +print(db.query(""" +EXPLAIN +SELECT + A.arrival_time + ,A.departure_time + ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open + ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close + ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks + ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count +FROM data A, data B +WHERE B.arrival_time <= window_close +AND B.departure_time >= window_open +GROUP BY 1, 2, 3, 4 +""").pl()[1, 1]) +``` + +# What are the alternatives? + +## The `data.table` way + +[`data.table`](https://github.com/Rdatatable/data.table) is a package that has historically been ahead of its time - in both speed and features. Development has taken a hit recently, but will likely [pick back up](https://github.com/Rdatatable/data.table/issues/5656). It's my favourite package on all fronts for data manipulation, but it suffers from the lack of broader R support across the ML and DL space. + +### The `foverlaps` function + +If this kind of overlapping join is common, shouldn't someone have developed a package for it? Turns out, `data.table` has, and with very specific constraints that make it the perfect solution to our problem (if you don't mind switching over to R, that is). + +The `foverlaps` function has these requirements: + +1. The input `data.table` objects have to be keyed for automatic recognition of columns. +2. The default match type matches all three qualifying cases from the image above. Side note: it also has match types for `within` overlaps, and for matching `start` and `end` windows. +3. The last two matching columns in the join condition in `by` must specify the `start` and `end` points of the overlapping window. This isn't a problem for us now, but does restrict future uses where we may want non-equi joins on other cases. + +### The code, _si_, the code!
+ +Without further ado: + +```r +library(data.table) +library(lubridate) + +######### BOILERPLATE CODE, NO LOGIC HERE #################### +arrival_time = as_datetime(c( + '2023-01-01 06:23:47.000000', '2023-01-01 06:26:42.000000', + '2023-01-01 06:30:20.000000', '2023-01-01 06:32:06.000000', + '2023-01-01 06:33:09.000000', '2023-01-01 06:34:08.000000', + '2023-01-01 06:36:40.000000', '2023-01-01 06:37:43.000000', + '2023-01-01 06:39:48.000000')) +departure_time = as_datetime(c( + '2023-01-01 06:25:08.000000', '2023-01-01 06:28:02.000000', + '2023-01-01 06:35:01.000000', '2023-01-01 06:33:48.000000', + '2023-01-01 06:36:01.000000', '2023-01-01 06:39:49.000000', + '2023-01-01 06:38:34.000000', '2023-01-01 06:40:48.000000', + '2023-01-01 06:46:10.000000')) +ID = c('A1', 'A1', 'A5', 'A6', 'B3', 'C3', 'A6', 'A5', 'A6') + +DT = data.table( + arrival_time = arrival_time, + departure_time = departure_time, + ID = ID) +######### BOILERPLATE CODE, NO LOGIC HERE #################### + +# A copy(DT) creates a copy of a data.table that isn't linked +# to the original one, so that changes in it don't reflect in +# the original DT object. +# The `:=` allow assignment by reference (i.e. "in place"). +DT_with_windows = copy(DT)[, `:=`( + window_start = arrival_time - minutes(1), + window_end = departure_time + minutes(1))] + +# This step is necessary for the second table, but not the first, but we +# key both data.tables to make the foverlap code very succinct. +setkeyv(DT, c("arrival_time", "departure_time")) +setkeyv(DT_with_windows, c("window_start", "window_end")) + +# The foverlap function returns a data.table, so we can simply apply +# the usual data.table syntax on it! +# Since we have the same name of some columns in both data.tables, +# the latter table's columns are prefixed with "i." to avoid conflicts. +foverlaps(DT, DT_with_windows)[ + , .(docked_trucks = list(unique(i.ID)), + docked_truck_count = uniqueN(i.ID)) + , .(arrival_time, departure_time)] +``` + +provides us the output: + +```r + arrival_time departure_time docked_trucks docked_truck_count + +1: 2023-01-01 06:23:47 2023-01-01 06:25:08 A1 1 +2: 2023-01-01 06:26:42 2023-01-01 06:28:02 A1 1 +3: 2023-01-01 06:30:20 2023-01-01 06:35:01 A5,A6,B3,C3 4 +4: 2023-01-01 06:32:06 2023-01-01 06:33:48 A5,A6,B3,C3 4 +5: 2023-01-01 06:33:09 2023-01-01 06:36:01 A5,A6,B3,C3 4 +6: 2023-01-01 06:34:08 2023-01-01 06:39:49 A5,A6,B3,C3 4 +7: 2023-01-01 06:36:40 2023-01-01 06:38:34 B3,C3,A6,A5 4 +8: 2023-01-01 06:37:43 2023-01-01 06:40:48 C3,A6,A5 3 +9: 2023-01-01 06:39:48 2023-01-01 06:46:10 C3,A5,A6 3 +``` + +### Considerations for using `data.table` + +The package offers a wonderful, nearly one-stop solution that doesn't require you to write the logic out for the query or command yourself, but has a major problem for a lot of users - it requires you to switch your codebase to R, and a lot of your tasks may be on Python or in an SQL pipeline. So, what do you do? + +Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you'll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call. 
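If the rest of your pipeline is in Python, one low-effort pattern is to shell out to `Rscript` and read the result back. This is only a sketch (not from the original post), and the script and CSV names are hypothetical:

```py
# Minimal sketch: run the foverlaps-based R script and pull its output
# back into Python. Assumes "overlap_counts.R" writes "overlap_counts.csv";
# both file names are hypothetical.
import subprocess
import polars as pl

subprocess.run(["Rscript", "overlap_counts.R"], check=True)  # run the R step
result = pl.read_csv("overlap_counts.csv")                   # back into the Python pipeline
print(result)
```

Tools like `rpy2` avoid the intermediate file, at the cost of one more dependency to maintain.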
+ diff --git a/assets/001_overlap_joins/overlap_algorithm.png b/content/blog/001_overlap_joins/overlap_algorithm.png similarity index 100% rename from assets/001_overlap_joins/overlap_algorithm.png rename to content/blog/001_overlap_joins/overlap_algorithm.png diff --git a/assets/003_representative_samples/001_Network_Cluster_1.png b/content/blog/002_representative_samples/001_Network_Cluster_1.png similarity index 100% rename from assets/003_representative_samples/001_Network_Cluster_1.png rename to content/blog/002_representative_samples/001_Network_Cluster_1.png diff --git a/assets/003_representative_samples/002_Network_Cluster_2.png b/content/blog/002_representative_samples/002_Network_Cluster_2.png similarity index 100% rename from assets/003_representative_samples/002_Network_Cluster_2.png rename to content/blog/002_representative_samples/002_Network_Cluster_2.png diff --git a/_posts/2023-10-19-Finding_Rep_Samples.md b/content/blog/002_representative_samples/index.md similarity index 98% rename from _posts/2023-10-19-Finding_Rep_Samples.md rename to content/blog/002_representative_samples/index.md index c606f2e..36a93ef 100644 --- a/_posts/2023-10-19-Finding_Rep_Samples.md +++ b/content/blog/002_representative_samples/index.md @@ -1,7 +1,7 @@ --- title: Finding representative samples efficiently for large datasets -permalink: /RepresentativeSample author: Avinash Mallya +date: 2023-10-19 tags: [representative, samples, faiss, approximate, nearest, neighbor, network, graph, networkx, polars, category] --- @@ -234,11 +234,11 @@ The next step in the process is to create a network graph using the edge-list. B Remember that we have identified the (k=5) nearest neighbors of **each** data point. Let's say that we have a point A that has a nearest neighbor B. C is **not** a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are sufficiently similar enough to B within a particular *minimum thershold*, then A will be connected to C through B! Hopefully a small visual below would help. -![How a network component is formed.](./assets/003_representative_samples/001_Network_Cluster_1.png) +![How a network component is formed.](001_Network_Cluster_1.png) What happens when such a concept is extended for many data points? Not all of them would be connected - because we're applying a *minimum* threshold that they have to meet. This is the only hueristic part of the rather fast process. Here's one more helpful visual: -![How a network cluster is formed.](./assets/003_representative_samples/002_Network_Cluster_2.png) +![How a network cluster is formed.](002_Network_Cluster_2.png) Very starry night-eque vibes here. Let's get to the code. @@ -407,4 +407,4 @@ If you want to write down an algorithmic way of looking at this approach, 2. Create an ANN database (based on a package such as `faiss`) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step. 3. Obtain an edge-list of k (from 5 to 100) nearest neighbors for **all** (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database. 4. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created. -5. Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample! 
\ No newline at end of file +5. Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample! diff --git a/assets/002_power_point_snap/01_Shapes.png b/content/blog/003_powerpointsnap/01_Shapes.png similarity index 100% rename from assets/002_power_point_snap/01_Shapes.png rename to content/blog/003_powerpointsnap/01_Shapes.png diff --git a/assets/002_power_point_snap/02_Charts.png b/content/blog/003_powerpointsnap/02_Charts.png similarity index 100% rename from assets/002_power_point_snap/02_Charts.png rename to content/blog/003_powerpointsnap/02_Charts.png diff --git a/assets/002_power_point_snap/03_Tables.png b/content/blog/003_powerpointsnap/03_Tables.png similarity index 100% rename from assets/002_power_point_snap/03_Tables.png rename to content/blog/003_powerpointsnap/03_Tables.png diff --git a/assets/002_power_point_snap/DataLabelsScreenshot.JPG b/content/blog/003_powerpointsnap/DataLabelsScreenshot.JPG similarity index 100% rename from assets/002_power_point_snap/DataLabelsScreenshot.JPG rename to content/blog/003_powerpointsnap/DataLabelsScreenshot.JPG diff --git a/assets/002_power_point_snap/Revenue_Presentation_1.png b/content/blog/003_powerpointsnap/Revenue_Presentation_1.png similarity index 100% rename from assets/002_power_point_snap/Revenue_Presentation_1.png rename to content/blog/003_powerpointsnap/Revenue_Presentation_1.png diff --git a/assets/002_power_point_snap/Revenue_Presentation_2.png b/content/blog/003_powerpointsnap/Revenue_Presentation_2.png similarity index 100% rename from assets/002_power_point_snap/Revenue_Presentation_2.png rename to content/blog/003_powerpointsnap/Revenue_Presentation_2.png diff --git a/assets/002_power_point_snap/SnapScreenshot.JPG b/content/blog/003_powerpointsnap/SnapScreenshot.JPG similarity index 100% rename from assets/002_power_point_snap/SnapScreenshot.JPG rename to content/blog/003_powerpointsnap/SnapScreenshot.JPG diff --git a/assets/002_power_point_snap/Table_Presentation_1.png b/content/blog/003_powerpointsnap/Table_Presentation_1.png similarity index 100% rename from assets/002_power_point_snap/Table_Presentation_1.png rename to content/blog/003_powerpointsnap/Table_Presentation_1.png diff --git a/assets/002_power_point_snap/Table_Presentation_2.png b/content/blog/003_powerpointsnap/Table_Presentation_2.png similarity index 100% rename from assets/002_power_point_snap/Table_Presentation_2.png rename to content/blog/003_powerpointsnap/Table_Presentation_2.png diff --git a/_posts/2023-10-20-PowerPointSnap.md b/content/blog/003_powerpointsnap/index.md similarity index 89% rename from _posts/2023-10-20-PowerPointSnap.md rename to content/blog/003_powerpointsnap/index.md index dff11bc..d4ac5a1 100644 --- a/_posts/2023-10-20-PowerPointSnap.md +++ b/content/blog/003_powerpointsnap/index.md @@ -1,7 +1,7 @@ --- title: Quick hacks to make client-ready presentations -permalink: /PowerPointSnap author: Avinash Mallya +date: 2023-10-20 tags: [powerpoint, ppt, vba] --- @@ -40,7 +40,7 @@ Here's a non-exhaustive list of all the options available. This is the part of the interface that can be used for shapes (which include charts and tables). -![The UI for copying *shape* properties](./assets/002_power_point_snap/01_Shapes.png) +![The UI for copying *shape* properties](01_Shapes.png) To use, first select a *shape* object, click on "Set". 
Then, choose the object you want to *Snap* its properties to (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit. @@ -50,7 +50,7 @@ Note that it's probably not to apply a property of a shape to a table - if you w Charts are also supported, with dedicated features for it. -![The UI for copying *chart* properties](./assets/002_power_point_snap/02_Charts.png) +![The UI for copying *chart* properties](02_Charts.png) What do these features do? You should be able to hover over the option and get a tooltip that shows what it's capable of, but here's another summary just in case: @@ -67,7 +67,7 @@ Your immediate senior in a consulting environment would frown at your chart, and It's **never** a one time affair. But don't worry, we have this nice feature to help us. If you click on the *Customize Label* option, you will get this (without the "Set" option): -![The UI for customizing labels.](./assets/002_power_point_snap/DataLabelsScreenshot.JPG) +![The UI for customizing labels.](DataLabelsScreenshot.JPG) Never mind the rather unfriendly legend entries. They're just here to demonstrate that you can do the following kinds of whacky abilities with your own chart! @@ -75,7 +75,7 @@ Never mind the rather unfriendly legend entries. They're just here to demonstrat Of course, visuals will do it more justice. For example, look at this image: -![There's a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren't centered.](./assets/002_power_point_snap/Revenue_Presentation_1.png) +![There's a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren't centered.](Revenue_Presentation_1.png) Here's what you can do: @@ -95,7 +95,7 @@ Here's what you can do: This is what your results should look like: -![Everything almost consistent. Your senior rests their eyes, and secretly wonder how you managed to do it quickly... maybe they should change some requirements...](./assets/002_power_point_snap/Revenue_Presentation_2.png) +![Everything almost consistent. Your senior rests their eyes, and secretly wonder how you managed to do it quickly... maybe they should change some requirements...](Revenue_Presentation_2.png) Of course, getting those calculations right is a whole different thing that will need some work. @@ -103,11 +103,11 @@ Of course, getting those calculations right is a whole different thing that will Oftentimes, you have two tables that show similar values... you know the drill. Here's what you can do in a scenario such as this: -![Similar data, but vastly different tables.](./assets/002_power_point_snap/Table_Presentation_1.png) +![Similar data, but vastly different tables.](Table_Presentation_1.png) This is what the *Tables* section of the tool looks like: -![The UI for *Tables*](./assets/002_power_point_snap/03_Tables.png) +![The UI for *Tables*](03_Tables.png) To align these tables together, @@ -119,7 +119,7 @@ To align these tables together, Here's what you'll end up with: -![Similar data, and similar enough tables.](./assets/002_power_point_snap/Table_Presentation_2.png) +![Similar data, and similar enough tables.](Table_Presentation_2.png) Pretty neat, eh? 
diff --git a/content/blog/_index.md b/content/blog/_index.md new file mode 100644 index 0000000..af4757a --- /dev/null +++ b/content/blog/_index.md @@ -0,0 +1,5 @@ +--- +title: "blog" +menu: "main" +weight: 2 +--- diff --git a/content/projects.md b/content/projects.md new file mode 100644 index 0000000..729331a --- /dev/null +++ b/content/projects.md @@ -0,0 +1,18 @@ +--- +title: "projects" +menu: "main" +weight: 3 +--- + +Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools. + +# Featured projects + +1. [BorrowChecker](https://avimallu.github.io/BorrowChecker/): A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. [Repository link](https://github.com/avimallu/BorrowChecker). +2. [PowerPointSnap](https://github.com/avimallu/PowerPointSnap): A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying [blog post]({{< ref "blog/003_powerpointsnap">}}). + +# Other work or contributions + +1. [IntelligentReceiptSplitter](https://github.com/avimallu/IntelligentReceiptSplitter): A relatively simple predecessor to [BorrowChecker](https://avimallu.github.io/BorrowChecker/) that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run. +2. [r.data.table.funs](https://github.com/avimallu/r.data.table.funs): A very small set of R functions that use `data.table`, that I found very useful earlier in my career to quicky churn out analyses. It is not ground-breaking, but rather something that anybody with sufficient basic skills in R and understand, and save an immense amount of time. +3. I [wrote](https://github.com/pola-rs/polars-book/pull/364) [several](https://github.com/pola-rs/polars-book/pull/358) [chapters](https://github.com/pola-rs/polars-book/pull/365/files) of the Polars Book, which have since been moved to the main Polars repository. Polars was a breadth of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like `data.table` and `dplyr` dominated), so I was eager to make it better for everybody making the switch. diff --git a/favicon.ico b/favicon.ico deleted file mode 100644 index 1de88d5..0000000 Binary files a/favicon.ico and /dev/null differ diff --git a/hugo.toml b/hugo.toml new file mode 100644 index 0000000..9268fa8 --- /dev/null +++ b/hugo.toml @@ -0,0 +1,81 @@ +baseURL = "https://avimallu.dev/" +theme = "hugo-bearcub" +copyright = "© Avinash Mallya" +defaultContentLanguage = "en" + +# Generate a nice robots.txt for SEO +enableRobotsTXT = true + +# Setup syntax highlighting without inline styles. For more information about +# why you'd want to avoid inline styles, see +# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Security-Policy/style-src#unsafe_inline_styles +[markup] + [markup.highlight] + lineNos = true + lineNumbersInTable = false + # This allows Bear Cub to use a variation of Dracula that is more accessible + # to people with poor eyesight. 
For more information about color contrast + # and accessibility, see https://web.dev/color-and-contrast-accessibility/ + noClasses = false + [markup.goldmark] + [markup.goldmark.renderer] + unsafe = true + +# Multilingual mode config. More for information about how to setup translation, +# see https://gohugo.io/content-management/multilingual/ +[languages] + [languages.en] + title = "Avinash's Blog" + languageName = "en-US 🇺🇸" + LanguageCode = "en-US" + contentDir = "content" + [languages.en.params] + madeWith = "Design via [Bear Cub](https://github.com/clente/hugo-bearcub)." + +[params] + # The description of your website + # description = "" + + # These images will show up when services want to generate a preview of a link + # to your site. Ignored if `generateSocialCard = true`. For more information + # about previews, see https://gohugo.io/templates/internal#twitter-cards and + # https://gohugo.io/templates/internal#open-graph + images = ["static/favicon.ico"] + + # This title is used as the site_name on the Hugo's internal opengraph + # structured data template + title = "Avinash's Blog" + + # Dates are displayed following the format below. For more information about + # formatting, see https://gohugo.io/functions/format/ + dateFormat = "2006-01-02" + + # If your blog is multilingual but you haven't translated a page, this theme + # will create a disabled link. By setting `hideUntranslated` to true, you can + # have the theme simply not show any link + hideUntranslated = false + + # (EXPERIMENTAL) This theme has two options for its CSS styles: "original" and + # "herman". The former is what you see on Bear Cub's demo (an optimized + # version of Hugo Bear Blog), while the latter has a more modern look based on + # Herman Martinus's version of the Blogster Minimal theme for Astro. + themeStyle = "original" + + # (EXPERIMENTAL) This theme is capable of dynamically generating social cards + # for posts that don't have `images` defined in their front matter; By setting + # `generateSocialCard` to false, you can prevent this behavior. For more + # information see layouts/partials/social_card.html + generateSocialCard = false + + # Social media. Delete any item you aren't using to make sure it won't show up + # in your website's metadata. + [params.social] + # twitter = "example" # Twitter handle (without '@') + # facebook_admin = "0000000000" # Facebook Page Admin ID + + # Author metadata. This is mostly used for the RSS feed of your site, but the + # email is also added to the footer of each post. You can hide the "reply to" + # link by using a `hideReply` param in front matter. + [params.author] + # name = "Avinash Mallya" # Your name as shown in the RSS feed metadata + # email = "nah@example.com" # Added to the footer so readers can reply to posts diff --git a/index.md b/index.md deleted file mode 100644 index 178f41e..0000000 --- a/index.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -# Feel free to add content and custom Front Matter to this file. -# To modify the layout, see https://jekyllrb.com/docs/themes/#overriding-theme-defaults ---- - -# Hey there! - -From LinkedIn or Github? Then you probably want to know a little bit [about me](./about.md). - -Bookmarked this page for some of my informative posts? See a list of them below. - -# Interesting Problems - -[*Fast Overlap Joins* to find the number of trucks at a station during time intervals](./_posts/2023-06-22-overlap_joins.md). - -[Quick PowerPoint hacks to make client-ready presentations](./_posts/2023-10-20-PowerPointSnap.md). 
- -[Finding representative samples efficiently for large datasets](./_posts/2023-10-19-Finding_Rep_Samples.md). \ No newline at end of file diff --git a/layouts/partials/nav.html b/layouts/partials/nav.html new file mode 100644 index 0000000..b9400df --- /dev/null +++ b/layouts/partials/nav.html @@ -0,0 +1,22 @@ +{{ range .Site.Menus.main.ByWeight }} + {{ .Name }} +{{ end }} +rss + + +{{ $translations := dict }} +{{ range .Translations }} + {{ $translations = merge $translations (dict .Language.Lang .) }} +{{ end }} + + +{{ range where .Site.Languages "Lang" "!=" .Page.Lang }} + {{ with (index $translations .Lang) }} + {{ .Language.LanguageName }} + {{ else }} + + {{ if not .Params.hideUntranslated }} + {{ .LanguageName }} + {{ end }} + {{ end }} +{{ end }} diff --git a/public/404.html b/public/404.html new file mode 100644 index 0000000..c67b78d --- /dev/null +++ b/public/404.html @@ -0,0 +1,5 @@ +404
+

Avinash's Blog

404

ʕノ•ᴥ•ʔノ ︵ ┻━┻

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/blog/001_overlap_joins/001_overlap_joins/index.html b/public/blog/001_overlap_joins/001_overlap_joins/index.html new file mode 100644 index 0000000..20ba1cb --- /dev/null +++ b/public/blog/001_overlap_joins/001_overlap_joins/index.html @@ -0,0 +1,569 @@ + + + + + + + +Overlap Joins: Number of docker trucks in an interval | Avinash's Blog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Avinash's Blog

+ +
+
+ +

Overlap Joins: Number of docked trucks in an interval

+ + + +

Premise

+

I stumbled upon an interesting StackOverflow question that was linked via an issue on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time I answered the question, Polars did not support non-equi joins, and any pure-Polars solution would have been pretty cumbersome.

+

I’m more of a right-tool-for-the-job person, so I tried to find a better solution.

+

Problem Statement

+

Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck’s ID.

+ + + + + +
 1import polars as pl # if you don't have polars, run 
+ 2                    # pip install 'polars[all]'
+ 3data = pl.from_repr("""
+ 4┌─────────────────────┬─────────────────────┬─────┐
+ 5│ arrival_time        ┆ departure_time      ┆ ID  │
+ 6│ ---                 ┆ ---                 ┆ --- │
+ 7│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+ 8╞═════════════════════╪═════════════════════╪═════╡
+ 9│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1  │
+10│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1  │
+11│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5  │
+12│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+13│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3  │
+14│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3  │
+15│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6  │
+16│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5  │
+17│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6  │
+18└─────────────────────┴─────────────────────┴─────┘
+19""")

We want to identify the number of trucks docked at any given time within a threshold of 1 minute prior to the arrival time of a truck, and 1 minute after the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.

+

Finding a solution to the problem

+

Evaluate for a specific row

+

Before we find a general solution to this problem, let’s consider a specific row to understand the problem better:

+ + + + + +
1"""
+2┌─────────────────────┬─────────────────────┬─────┐
+3│ arrival_time        ┆ departure_time      ┆ ID  │
+4│ ---                 ┆ ---                 ┆ --- │
+5│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+6╞═════════════════════╪═════════════════════╪═════╡
+7│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+8└─────────────────────┴─────────────────────┴─────┘
+9"""

For this row, we need to find the number of trucks present between 2023-01-01 06:31:06 (1 minute prior to the arrival_time) and 2023-01-01 06:34:48 (1 minute after the departure_time). Manually going through the original dataset, we see that B3, C3, A6 and A5 are the truck IDs that qualify - each of them is at the station at some point between 2023-01-01 06:31:06 and 2023-01-01 06:34:48.
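To make the window concrete, here is the same arithmetic in plain Python (a small illustrative sketch, using the timestamps from the row above):

from datetime import datetime, timedelta

arrival = datetime(2023, 1, 1, 6, 32, 6)
departure = datetime(2023, 1, 1, 6, 33, 48)

# Pad the interval by one minute on each side to get the overlap window.
window_open = arrival - timedelta(minutes=1)     # 2023-01-01 06:31:06
window_close = departure + timedelta(minutes=1)  # 2023-01-01 06:34:48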

+

Visually deriving an algorithm

+

There are several ways a truck can qualify as present in the overlap window defined by a particular row. For the example above, the cases are shown below (the visualization generalizes, because for each row we can easily calculate the overlap window relative to its arrival and departure times):

+

The five different ways a period can overlap.

+

Take some time to absorb these cases - it’s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.

+

Writing an SQL query based on the algorithm

+

In theory, we can use any language capable of expressing the algorithmic requirements outlined in the section above. Why choose SQL? It’s often able to convey the logic of an algorithm elegantly; and while it can get excessively verbose at times, it doesn’t here.

+

Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).

+

Introducing the DuckDB package

+

Once again, in theory, any SQL engine can be used. Few, however, match the ease of use that DuckDB provides:

+
  1. no expensive set-up time (meaning no need for setting up databases, even temporary ones),
  2. no dependencies (other than DuckDB itself, just pip install duckdb),
  3. some very friendly SQL extensions, and
  4. ability to work directly on Polars and Pandas DataFrames without conversions

all with mind-blowing speed that stands shoulder-to-shoulder with Polars. We’ll also use a few advanced SQL concepts noted below.

+

Self-joins

+

This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.

+

A bullet train recap of non-equi joins

+

A key concept that we’ll use is the idea of joining on a range of values rather than a specific value. That is, instead of the usual LEFT JOIN ON A.column = B.column, we can do LEFT JOIN ON A.column <= B.column for one row in table A to match to multiple rows in B. DuckDB has a blog post that outlines this join in detail, including fast implementation.
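As a quick, hypothetical illustration (a toy table, not part of the original post), here is a tiny non-equi join in DuckDB, where each left-hand row matches every right-hand row at or below its threshold:

import duckdb as db

# Each left-hand row matches all right-hand rows whose value is <= its
# threshold, so a single left row can produce multiple output rows.
print(db.query("""
    SELECT t.name, m.value
    FROM  (VALUES ('low', 10), ('high', 20)) AS t(name, threshold)
    LEFT JOIN (VALUES (5), (15), (25))       AS m(value)
    ON    m.value <= t.threshold
"""))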

+

The concept of LIST columns

+

DuckDB has first-class support for LIST columns - that is, each row in a LIST column can have a varying length (much like a Python list), but every element must have the same datatype (like R’s vector). Using LIST columns allows us to eschew an additional GROUP BY on top of a WHERE filter or SELECT DISTINCT operation, since we can perform those operations directly on the LIST column itself.
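As a minimal sketch (on a made-up toy table, not the truck data), this is roughly how LIST, LIST_DISTINCT and LIST_UNIQUE interact:

import duckdb as db

# LIST collects values per group, LIST_DISTINCT de-duplicates the resulting
# list, and LIST_UNIQUE counts the number of distinct elements in it.
print(db.query("""
    SELECT
        station
        ,LIST(ID)                AS all_ids
        ,LIST_DISTINCT(LIST(ID)) AS distinct_ids
        ,LIST_UNIQUE(LIST(ID))   AS distinct_count
    FROM (VALUES ('S1', 'A1'), ('S1', 'A1'), ('S1', 'B3'), ('S2', 'C3')) AS t(station, ID)
    GROUP BY station
"""))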

+

Date algebra

+

Dates can be rather difficult to handle well in most tools and languages, with several packages purpose-built to make handling them easier - lubridate from the tidyverse is a stellar example. Thankfully, DuckDB provides a similar Swiss-army-knife set of tools, including INTERVALs (a special data type that represents a period of time independent of any specific time value) that can be added to or subtracted from TIMESTAMP values.
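For example (a standalone snippet with timestamps made up for illustration), INTERVAL arithmetic on TIMESTAMPs looks like this:

import duckdb as db

# Subtracting/adding an INTERVAL shifts a TIMESTAMP by a fixed period of time.
print(db.query("""
    SELECT
        TIMESTAMP '2023-01-01 06:32:06' - (INTERVAL 1 MINUTE)  AS window_open
        ,TIMESTAMP '2023-01-01 06:33:48' + (INTERVAL 1 MINUTE) AS window_close
"""))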

+

Tell me the query, PLEASE!

+

Okay - that was a lot of background. Let’s have at it! The query by itself in SQL is (see immediately below for runnable Python code):

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN (
+16    SELECT *
+17        ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+18    FROM data) B
+19
+20ON ((B.arrival_time <= A.window_open AND 
+21    	(B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+22    (B.arrival_time >= A.window_open AND 
+23                                  B.departure_time  <= A.window_close) OR
+24    (B.arrival_time >= A.window_open AND
+25    	(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close))
+26GROUP BY 1, 2, 3, 4

A small, succinct query such as this will need a bit of explanation to take it all in. Here’s one below, reproducible in Python (make sure to install duckdb first!). Expand it to view.

+
SQL with explanation. + + + + + +
 1import duckdb as db
+ 2db.query("""
+ 3    SELECT
+ 4        A.arrival_time
+ 5        ,A.departure_time
+ 6        ,A.window_open
+ 7        ,A.window_close
+ 8        -- LIST aggregates the values into a LIST column
+ 9        -- and LIST_DISTINCT finds the unique values in it
+10        ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11        -- finally, LIST_UNIQUE calculates the unique number of values in it
+12        ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+13
+14    FROM (
+15        SELECT
+16            *
+17            ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+18            ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+19        FROM data -- remember we defined data as the Polars DataFrame with our truck station data
+20    ) A
+21
+22    LEFT JOIN (
+23        SELECT
+24            *
+25            -- This is the time, in seconds between the arrival and departure of
+26            -- each truck PER ROW in the original data-frame 
+27            ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+28        FROM data -- this is where we perform a self-join
+29    ) B
+30
+31    ON (
+32        -- Case 2 in the diagram;
+33        (B.arrival_time <= A.window_open AND 
+34            -- Adding the duration here makes sure that the second interval
+35            -- is at least ENDING AFTER the start of the overlap window
+36            (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+37
+38        -- Case 3 in the diagram - the simplest of all five cases
+39        (B.arrival_time >= A.window_open AND 
+40                                      B.departure_time  <= A.window_close) OR
+41
+42        -- Case 4 in the diagram;
+43        (B.arrival_time >= A.window_open AND
+44            -- Subtracting the duration here makes sure that the second interval
+45            -- STARTS BEFORE the end of the overlap window.
+46            (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)
+47    )
+48    GROUP BY 1, 2, 3, 4
+49""")
+

The output of this query is:

+ + + + + +
"""
+┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
+│    arrival_time     │   departure_time    │     window_open     │ … │  docked_trucks   │ docked_truck_count │
+│      timestamp      │      timestamp      │      timestamp      │   │    varchar[]     │       uint64       │
+├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
+│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │                  4 │
+│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3]     │                  3 │
+│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3]     │                  3 │
+├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
+│ 9 rows                                                                                  6 columns (5 shown) │
+└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+"""

We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also see how seamlessly DuckDB integrates with an existing Pandas or Polars pipeline, with zero conversion cost. In fact, we can convert the result back to a Polars or Pandas DataFrame with db.query(...).pl() or db.query(...).df() respectively.
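A quick sketch of that round trip (assuming Polars and pandas are installed; the column name is made up):

import duckdb as db
import polars as pl

data = pl.DataFrame({"x": [1, 2, 3]})

# DuckDB can query the in-scope DataFrame directly; .pl() and .df() convert
# the result back to a Polars and a Pandas DataFrame respectively.
result_polars = db.query("SELECT x * 2 AS y FROM data").pl()
result_pandas = db.query("SELECT x * 2 AS y FROM data").df()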

+

Can we make the SQL simpler?

+

Now that we’ve understood the logic that goes into the query, let’s try to optimize the algorithm. We have the three conditions:

+ + + + + +
1-- Case 2 in the diagram
+2(B.arrival_time <= A.window_open AND 
+3    (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+4-- Case 3 in the diagram
+5(B.arrival_time >= A.window_open AND 
+6                              B.departure_time  <= A.window_close) OR
+7-- Case 4 in the diagram
+8(B.arrival_time >= A.window_open AND
+9    (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)

What is common between these three conditions? It takes a while to see it, but all three cases boil down to the same thing: the other truck’s interval must start before our window closes, and end after our window opens. This can be simplified to just:

+ + + + + +
1B.arrival_time   <= A.window_close AND
+2B.departure_time >= A.window_open

making our query much simpler!
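The same condition is easy to sanity-check outside SQL. Here is a small Python sketch of the overlap test (the names are mine, not from the query):

from datetime import datetime

def overlaps(b_arrival: datetime, b_departure: datetime,
             window_open: datetime, window_close: datetime) -> bool:
    # Two intervals overlap exactly when each one starts before the other ends.
    return b_arrival <= window_close and b_departure >= window_open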

+

Simplified SQL: Part 1

+

We’ve removed the need for the duration calculation altogether now. Therefore, we can write:

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN data B
+16
+17ON (
+18    B.arrival_time   <= A.window_close AND
+19    B.departure_time >= A.window_open
+20)
+21GROUP BY 1, 2, 3, 4

Can we simplify this even further?

+

Simplification: Part 2

+

I think the SQL query in the section above is already very easy to read. However, it is a little clunky overall, and we can leverage DuckDB’s extensive optimizations to improve legibility by rewriting the query as a cross join:

+ + + + + +
 1SELECT
+ 2    A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.arrival_time - (INTERVAL 1 MINUTE)   AS window_open
+ 5    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8FROM  data A, data B
+ 9WHERE B.arrival_time   <= window_close
+10AND   B.departure_time >= window_open
+11GROUP BY 1, 2, 3, 4

Why does this work? Before optimization on DuckDB, this is what the query plan looks like:

+
DuckDB query plan before optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│           FILTER          │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│     (arrival_time <=      │                             
+25│(departure_time + to_m...  │                             
+26│        AS BIGINT))))      │                             
+27│    (departure_time >=     │                             
+28│(arrival_time - to_min...  │                             
+29│        AS BIGINT))))      │                             
+30└─────────────┬─────────────┘                                                          
+31┌─────────────┴─────────────┐                             
+32│       CROSS_PRODUCT       ├──────────────┐              
+33└─────────────┬─────────────┘              │                                           
+34┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+35│         ARROW_SCAN        ││         ARROW_SCAN        │
+36└───────────────────────────┘└───────────────────────────┘ 
+37"""                            
+

After optimization, the CROSS_PRODUCT is automatically optimized to an interval join!

+
DuckDB query plan after optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│      COMPARISON_JOIN      │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│           INNER           │                             
+25│ ((departure_time + '00:01 │                             
+26│     :00'::INTERVAL) >=    ├──────────────┐              
+27│        arrival_time)      │              │              
+28│((arrival_time - '00:01:00'│              │              
+29│       ::INTERVAL) <=      │              │              
+30│       departure_time)     │              │              
+31└─────────────┬─────────────┘              │                                           
+32┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+33│         ARROW_SCAN        ││         ARROW_SCAN        │
+34└───────────────────────────┘└───────────────────────────┘
+35"""                      
+

So in effect, we’re exploiting a feature of DuckDB that allows us to write our queries in a suboptimal manner for greater readability, and letting the optimizer do a good chunk of the work for us. I wouldn’t recommend relying on this in general, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.

+

How to get query plans?

+

I’m glad you asked. Here’s the DuckDB page explaining EXPLAIN (heh). Here’s the code I used:

+ + + + + +
 1import duckdb as db
+ 2db.sql("SET EXPLAIN_OUTPUT='all';")
+ 3print(db.query("""
+ 4EXPLAIN
+ 5SELECT
+ 6    A.arrival_time
+ 7    ,A.departure_time
+ 8    ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
+ 9    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+10    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+12FROM  data A, data B
+13WHERE B.arrival_time   <= window_close
+14AND   B.departure_time >= window_open
+15GROUP BY 1, 2, 3, 4
+16""").pl()[1, 1])

What are the alternatives?

+

The data.table way

+

data.table is a package that has historically been ahead of its time - in both speed and features. Development has taken a hit recently, but will likely pick back up. It’s my favourite package on all fronts for data manipulation, but it suffers simply from the lack of broader R support across the ML and DL space.

+

The foverlaps function

+

If this kind of overlapping join is common, shouldn’t someone have developed a package for it? Turns out, data.table has, and with very specific constraints that make it the perfect solution to our problem (if you don’t mind switching over to R, that is).

+

The foverlaps function has these requirements:

+
  1. The input data.table objects have to be keyed for automatic recognition of columns.
  2. The default match type ("any") matches all three qualifying cases from the image above. Side note: it also offers match types for "within", "start" and "end" overlaps.
  3. The last two matching columns in the join condition in by must specify the start and end points of the overlapping window. This isn’t a problem for us now, but it does restrict future uses where we may want non-equi joins on other conditions.

The code, si, the code!

+

Without further ado:

+ + + + + +
 1library(data.table)
+ 2library(lubridate)
+ 3
+ 4######### BOILERPLATE CODE, NO LOGIC HERE ####################
+ 5arrival_time = as_datetime(c(
+ 6  '2023-01-01 06:23:47.000000', '2023-01-01 06:26:42.000000',
+ 7  '2023-01-01 06:30:20.000000', '2023-01-01 06:32:06.000000',
+ 8  '2023-01-01 06:33:09.000000', '2023-01-01 06:34:08.000000',
+ 9  '2023-01-01 06:36:40.000000', '2023-01-01 06:37:43.000000',
+10  '2023-01-01 06:39:48.000000'))
+11departure_time = as_datetime(c(
+12  '2023-01-01 06:25:08.000000', '2023-01-01 06:28:02.000000',
+13  '2023-01-01 06:35:01.000000', '2023-01-01 06:33:48.000000',
+14  '2023-01-01 06:36:01.000000', '2023-01-01 06:39:49.000000',
+15  '2023-01-01 06:38:34.000000', '2023-01-01 06:40:48.000000',
+16  '2023-01-01 06:46:10.000000'))
+17ID = c('A1', 'A1', 'A5', 'A6', 'B3', 'C3', 'A6', 'A5', 'A6')
+18
+19DT = data.table(
+20  arrival_time = arrival_time,
+21  departure_time = departure_time,
+22  ID = ID)
+23######### BOILERPLATE CODE, NO LOGIC HERE ####################
+24
+25# A copy(DT) creates a copy of a data.table that isn't linked
+26# to the original one, so that changes in it don't reflect in
+27# the original DT object.
+28# The `:=` allow assignment by reference (i.e. "in place").
+29DT_with_windows = copy(DT)[, `:=`(
+30  window_start   = arrival_time   - minutes(1),
+31  window_end = departure_time + minutes(1))]
+32
+33# This step is necessary for the second table (not the first), but we
+34# key both data.tables to make the foverlaps code very succinct.
+35setkeyv(DT, c("arrival_time", "departure_time"))
+36setkeyv(DT_with_windows, c("window_start", "window_end"))
+37
+38# The foverlaps function returns a data.table, so we can simply apply
+39# the usual data.table syntax on it!
+40# Since we have the same name of some columns in both data.tables,
+41# the latter table's columns are prefixed with "i." to avoid conflicts.
+42foverlaps(DT, DT_with_windows)[
+43  , .(docked_trucks = list(unique(i.ID)),
+44      docked_truck_count = uniqueN(i.ID))
+45  , .(arrival_time, departure_time)]

provides us the output:

+ + + + + +
 1          arrival_time      departure_time docked_trucks docked_truck_count
+ 2                <POSc>              <POSc>        <list>              <int>
+ 31: 2023-01-01 06:23:47 2023-01-01 06:25:08            A1                  1
+ 42: 2023-01-01 06:26:42 2023-01-01 06:28:02            A1                  1
+ 53: 2023-01-01 06:30:20 2023-01-01 06:35:01   A5,A6,B3,C3                  4
+ 64: 2023-01-01 06:32:06 2023-01-01 06:33:48   A5,A6,B3,C3                  4
+ 75: 2023-01-01 06:33:09 2023-01-01 06:36:01   A5,A6,B3,C3                  4
+ 86: 2023-01-01 06:34:08 2023-01-01 06:39:49   A5,A6,B3,C3                  4
+ 97: 2023-01-01 06:36:40 2023-01-01 06:38:34   B3,C3,A6,A5                  4
+108: 2023-01-01 06:37:43 2023-01-01 06:40:48      C3,A6,A5                  3
+119: 2023-01-01 06:39:48 2023-01-01 06:46:10      C3,A5,A6                  3

Considerations for using data.table

+

The package offers a wonderful, nearly one-stop solution that doesn’t require you to write out the join logic yourself, but it has a major problem for a lot of users - it requires you to switch to R, while much of your work may live in Python or in an SQL pipeline. So, what do you do?

+

Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you’ll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.
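If you do decide to keep the R step, one low-effort pattern (a hedged sketch; the script and file names here are hypothetical) is to call an Rscript from the Python pipeline and read its output back:

import subprocess
import polars as pl

# Run the R script that performs the foverlaps() logic and writes a CSV.
subprocess.run(["Rscript", "overlap_join.R"], check=True)

# Pull the result back into the Python side of the pipeline.
result = pl.read_csv("overlap_join_output.csv")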

+ +
+

+ +

+ + +

+ + Reply to this post by email ↪ + +

+ + + +
+
+ © Avinash Mallya | Design via Bear Cub. +
+ + + + + diff --git a/public/blog/001_overlap_joins/index.html b/public/blog/001_overlap_joins/index.html new file mode 100644 index 0000000..e4c8e5e --- /dev/null +++ b/public/blog/001_overlap_joins/index.html @@ -0,0 +1,309 @@ +Overlap Joins: Number of docker trucks in an interval | Avinash's Blog
+

Avinash's Blog

Overlap Joins: Number of docked trucks in an interval

Premise

I stumbled upon an interesting StackOverflow question that was linked via an issue on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time I answered the question, Polars did not support non-equi joins, and any pure-Polars solution would have been pretty cumbersome.

I’m more of a right-tool-for-the-job person, so I tried to find a better solution.

Problem Statement

Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck’s ID.

 1import polars as pl # if you don't have polars, run 
+ 2                    # pip install 'polars[all]'
+ 3data = pl.from_repr("""
+ 4┌─────────────────────┬─────────────────────┬─────┐
+ 5│ arrival_time        ┆ departure_time      ┆ ID  │
+ 6│ ---                 ┆ ---                 ┆ --- │
+ 7│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+ 8╞═════════════════════╪═════════════════════╪═════╡
+ 9│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1  │
+10│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1  │
+11│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5  │
+12│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+13│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3  │
+14│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3  │
+15│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6  │
+16│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5  │
+17│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6  │
+18└─────────────────────┴─────────────────────┴─────┘
+19""")

We want to identify the number of trucks docked at any given time within a threshold of 1 minute prior to the arrival time of a truck, and 1 minute after the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.

Finding a solution to the problem

Evaluate for a specific row

Before we find a general solution to this problem, let’s consider a specific row to understand the problem better:

1"""
+2┌─────────────────────┬─────────────────────┬─────┐
+3│ arrival_time        ┆ departure_time      ┆ ID  │
+4│ ---                 ┆ ---                 ┆ --- │
+5│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+6╞═════════════════════╪═════════════════════╪═════╡
+7│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+8└─────────────────────┴─────────────────────┴─────┘
+9"""

For this row, we need to find the number of trucks present between 2023-01-01 06:31:06 (1 minute prior to the arrival_time) and 2023-01-01 06:34:48 (1 minute after the departure_time). Manually going through the original dataset, we see that B3, C3, A6 and A5 are the truck IDs that qualify - each of them is at the station at some point between 2023-01-01 06:31:06 and 2023-01-01 06:34:48.

Visually deriving an algorithm

There are several ways a truck can qualify as present in the overlap window defined by a particular row. For the example above, the cases are shown below (the visualization generalizes, because for each row we can easily calculate the overlap window relative to its arrival and departure times):

The five different ways a period can overlap.

Take some time to absorb these cases - it’s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.

Writing an SQL query based on the algorithm

In theory, we can use any language capable of expressing the algorithmic requirements outlined in the section above. Why choose SQL? It’s often able to convey the logic of an algorithm elegantly; and while it can get excessively verbose at times, it doesn’t here.

Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).

Introducing the DuckDB package

Once again, in theory, any SQL engine can be used. Few, however, match the ease of use that DuckDB provides:

  1. no expensive set-up time (meaning no need for setting up databases, even temporary ones),
  2. no dependencies (other than DuckDB itself, just pip install duckdb),
  3. some very friendly SQL extensions, and
  4. ability to work directly on Polars and Pandas DataFrames without conversions

all with mind-blowing speed that stands shoulder-to-shoulder with Polars. We’ll also use a few advanced SQL concepts noted below.

Self-joins

This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.

A bullet train recap of non-equi joins

A key concept that we’ll use is the idea of joining on a range of values rather than a specific value. That is, instead of the usual LEFT JOIN ON A.column = B.column, we can do LEFT JOIN ON A.column <= B.column for one row in table A to match to multiple rows in B. DuckDB has a blog post that outlines this join in detail, including fast implementation.

The concept of LIST columns

DuckDB has first-class support for LIST columns - that is, each row in a LIST column can have a varying length (much like a Python list), but every element must have the same datatype (like R’s vector). Using LIST columns allows us to eschew an additional GROUP BY on top of a WHERE filter or SELECT DISTINCT operation, since we can perform those operations directly on the LIST column itself.

Date algebra

Dates can be rather difficult to handle well in most tools and languages, with several packages purpose-built to make handling them easier - lubridate from the tidyverse is a stellar example. Thankfully, DuckDB provides a similar Swiss-army-knife set of tools, including INTERVALs (a special data type that represents a period of time independent of any specific time value) that can be added to or subtracted from TIMESTAMP values.

Tell me the query, PLEASE!

Okay - that was a lot of background. Let’s have at it! The query by itself in SQL is (see immediately below for runnable Python code):

 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN (
+16    SELECT *
+17        ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+18    FROM data) B
+19
+20ON ((B.arrival_time <= A.window_open AND 
+21    	(B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+22    (B.arrival_time >= A.window_open AND 
+23                                  B.departure_time  <= A.window_close) OR
+24    (B.arrival_time >= A.window_open AND
+25    	(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close))
+26GROUP BY 1, 2, 3, 4

A small, succinct query such as this will need a bit of explanation to take it all in. Here’s one below, reproducible in Python (make sure to install duckdb first!). Expand it to view.

SQL with explanation.
 1import duckdb as db
+ 2db.query("""
+ 3    SELECT
+ 4        A.arrival_time
+ 5        ,A.departure_time
+ 6        ,A.window_open
+ 7        ,A.window_close
+ 8        -- LIST aggregates the values into a LIST column
+ 9        -- and LIST_DISTINCT finds the unique values in it
+10        ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11        -- finally, LIST_UNIQUE calculates the unique number of values in it
+12        ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+13
+14    FROM (
+15        SELECT
+16            *
+17            ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+18            ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+19        FROM data -- remember we defined data as the Polars DataFrame with our truck station data
+20    ) A
+21
+22    LEFT JOIN (
+23        SELECT
+24            *
+25            -- This is the time, in seconds between the arrival and departure of
+26            -- each truck PER ROW in the original data-frame 
+27            ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+28        FROM data -- this is where we perform a self-join
+29    ) B
+30
+31    ON (
+32        -- Case 2 in the diagram;
+33        (B.arrival_time <= A.window_open AND 
+34            -- Adding the duration here makes sure that the second interval
+35            -- is at least ENDING AFTER the start of the overlap window
+36            (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+37
+38        -- Case 3 in the diagram - the simplest of all five cases
+39        (B.arrival_time >= A.window_open AND 
+40                                      B.departure_time  <= A.window_close) OR
+41
+42        -- Case 4 in the diagram;
+43        (B.arrival_time >= A.window_open AND
+44            -- Subtracting the duration here makes sure that the second interval
+45            -- STARTS BEFORE the end of the overlap window.
+46            (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)
+47    )
+48    GROUP BY 1, 2, 3, 4
+49""")

The output of this query is:

"""
+┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
+│    arrival_time     │   departure_time    │     window_open     │ … │  docked_trucks   │ docked_truck_count │
+│      timestamp      │      timestamp      │      timestamp      │   │    varchar[]     │       uint64       │
+├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
+│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │                  4 │
+│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3]     │                  3 │
+│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3]     │                  3 │
+├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
+│ 9 rows                                                                                  6 columns (5 shown) │
+└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+"""

We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also see how seamlessly DuckDB integrates with an existing Pandas or Polars pipeline, with zero conversion cost. In fact, we can convert the result back to a Polars or Pandas DataFrame with db.query(...).pl() or db.query(...).df() respectively.

Can we make the SQL simpler?

Now that we’ve understood the logic that goes into the query, let’s try to optimize the algorithm. We have the three conditions:

1-- Case 2 in the diagram
+2(B.arrival_time <= A.window_open AND 
+3    (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+4-- Case 3 in the diagram
+5(B.arrival_time >= A.window_open AND 
+6                              B.departure_time  <= A.window_close) OR
+7-- Case 4 in the diagram
+8(B.arrival_time >= A.window_open AND
+9    (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)

What is common between these three conditions? It takes a while to see it, but all three cases boil down to the same thing: the other truck’s interval must start before our window closes, and end after our window opens. This can be simplified to just:

1B.arrival_time   <= A.window_close AND
+2B.departure_time >= A.window_open

making our query much simpler!

Simplified SQL: Part 1

We’ve removed the need for the duration calculation altogether now. Therefore, we can write:

 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN data B
+16
+17ON (
+18    B.arrival_time   <= A.window_close AND
+19    B.departure_time >= A.window_open
+20)
+21GROUP BY 1, 2, 3, 4

Can we simplify this even further?

Simplification: Part 2

I think the SQL query in the section above is already very easy to read. However, it is a little clunky overall, and we can leverage DuckDB’s extensive optimizations to improve legibility by rewriting the query as a cross join:

 1SELECT
+ 2    A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.arrival_time - (INTERVAL 1 MINUTE)   AS window_open
+ 5    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8FROM  data A, data B
+ 9WHERE B.arrival_time   <= window_close
+10AND   B.departure_time >= window_open
+11GROUP BY 1, 2, 3, 4

Why does this work? Before optimization on DuckDB, this is what the query plan looks like:

DuckDB query plan before optimization
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│           FILTER          │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│     (arrival_time <=      │                             
+25│(departure_time + to_m...  │                             
+26│        AS BIGINT))))      │                             
+27│    (departure_time >=     │                             
+28│(arrival_time - to_min...  │                             
+29│        AS BIGINT))))      │                             
+30└─────────────┬─────────────┘                                                          
+31┌─────────────┴─────────────┐                             
+32│       CROSS_PRODUCT       ├──────────────┐              
+33└─────────────┬─────────────┘              │                                           
+34┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+35│         ARROW_SCAN        ││         ARROW_SCAN        │
+36└───────────────────────────┘└───────────────────────────┘ 
+37"""                            

After optimization, the CROSS_PRODUCT is automatically optimized to an interval join!

DuckDB query plan after optimization
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│      COMPARISON_JOIN      │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│           INNER           │                             
+25│ ((departure_time + '00:01 │                             
+26│     :00'::INTERVAL) >=    ├──────────────┐              
+27│        arrival_time)      │              │              
+28│((arrival_time - '00:01:00'│              │              
+29│       ::INTERVAL) <=      │              │              
+30│       departure_time)     │              │              
+31└─────────────┬─────────────┘              │                                           
+32┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+33│         ARROW_SCAN        ││         ARROW_SCAN        │
+34└───────────────────────────┘└───────────────────────────┘
+35"""                      

So in effect, we’re exploiting a feature of DuckDB that allows us to write our queries in a suboptimal manner for greater readability, and letting the optimizer do a good chunk of the work for us. I wouldn’t recommend relying on this in general, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.

How to get query plans?

I’m glad you asked. Here’s the DuckDB page explaining EXPLAIN (heh). Here’s the code I used:

 1import duckdb as db
+ 2db.sql("SET EXPLAIN_OUTPUT='all';")
+ 3print(db.query("""
+ 4EXPLAIN
+ 5SELECT
+ 6    A.arrival_time
+ 7    ,A.departure_time
+ 8    ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
+ 9    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+10    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+12FROM  data A, data B
+13WHERE B.arrival_time   <= window_close
+14AND   B.departure_time >= window_open
+15GROUP BY 1, 2, 3, 4
+16""").pl()[1, 1])

What are the alternatives?

The data.table way

data.table is a package that has historically been ahead of its time - in both speed and features. Development has taken a hit recently, but will likely pick back up. It’s my favourite package on all fronts for data manipulation, but it suffers simply from the lack of broader R support across the ML and DL space.

The foverlaps function

If this kind of overlapping join is common, shouldn’t someone have developed a package for it? Turns out, data.table has, and with very specific constraints that make it the perfect solution to our problem (if you don’t mind switching over to R, that is).

The foverlaps function has these requirements:

  1. The input data.table objects have to be keyed for automatic recognition of columns.
  2. The default match type ("any") matches all three qualifying cases from the image above. Side note: it also offers match types for "within", "start" and "end" overlaps.
  3. The last two matching columns in the join condition in by must specify the start and end points of the overlapping window. This isn’t a problem for us now, but it does restrict future uses where we may want non-equi joins on other conditions.

The code, si, the code!

Without further ado:

 1library(data.table)
+ 2library(lubridate)
+ 3
+ 4######### BOILERPLATE CODE, NO LOGIC HERE ####################
+ 5arrival_time = as_datetime(c(
+ 6  '2023-01-01 06:23:47.000000', '2023-01-01 06:26:42.000000',
+ 7  '2023-01-01 06:30:20.000000', '2023-01-01 06:32:06.000000',
+ 8  '2023-01-01 06:33:09.000000', '2023-01-01 06:34:08.000000',
+ 9  '2023-01-01 06:36:40.000000', '2023-01-01 06:37:43.000000',
+10  '2023-01-01 06:39:48.000000'))
+11departure_time = as_datetime(c(
+12  '2023-01-01 06:25:08.000000', '2023-01-01 06:28:02.000000',
+13  '2023-01-01 06:35:01.000000', '2023-01-01 06:33:48.000000',
+14  '2023-01-01 06:36:01.000000', '2023-01-01 06:39:49.000000',
+15  '2023-01-01 06:38:34.000000', '2023-01-01 06:40:48.000000',
+16  '2023-01-01 06:46:10.000000'))
+17ID = c('A1', 'A1', 'A5', 'A6', 'B3', 'C3', 'A6', 'A5', 'A6')
+18
+19DT = data.table(
+20  arrival_time = arrival_time,
+21  departure_time = departure_time,
+22  ID = ID)
+23######### BOILERPLATE CODE, NO LOGIC HERE ####################
+24
+25# A copy(DT) creates a copy of a data.table that isn't linked
+26# to the original one, so that changes in it don't reflect in
+27# the original DT object.
+28# The `:=` allow assignment by reference (i.e. "in place").
+29DT_with_windows = copy(DT)[, `:=`(
+30  window_start   = arrival_time   - minutes(1),
+31  window_end = departure_time + minutes(1))]
+32
+33# This step is necessary for the second table (not the first), but we
+34# key both data.tables to make the foverlaps code very succinct.
+35setkeyv(DT, c("arrival_time", "departure_time"))
+36setkeyv(DT_with_windows, c("window_start", "window_end"))
+37
+38# The foverlaps function returns a data.table, so we can simply apply
+39# the usual data.table syntax on it!
+40# Since we have the same name of some columns in both data.tables,
+41# the latter table's columns are prefixed with "i." to avoid conflicts.
+42foverlaps(DT, DT_with_windows)[
+43  , .(docked_trucks = list(unique(i.ID)),
+44      docked_truck_count = uniqueN(i.ID))
+45  , .(arrival_time, departure_time)]

provides us the output:

 1          arrival_time      departure_time docked_trucks docked_truck_count
+ 2                <POSc>              <POSc>        <list>              <int>
+ 31: 2023-01-01 06:23:47 2023-01-01 06:25:08            A1                  1
+ 42: 2023-01-01 06:26:42 2023-01-01 06:28:02            A1                  1
+ 53: 2023-01-01 06:30:20 2023-01-01 06:35:01   A5,A6,B3,C3                  4
+ 64: 2023-01-01 06:32:06 2023-01-01 06:33:48   A5,A6,B3,C3                  4
+ 75: 2023-01-01 06:33:09 2023-01-01 06:36:01   A5,A6,B3,C3                  4
+ 86: 2023-01-01 06:34:08 2023-01-01 06:39:49   A5,A6,B3,C3                  4
+ 97: 2023-01-01 06:36:40 2023-01-01 06:38:34   B3,C3,A6,A5                  4
+108: 2023-01-01 06:37:43 2023-01-01 06:40:48      C3,A6,A5                  3
+119: 2023-01-01 06:39:48 2023-01-01 06:46:10      C3,A5,A6                  3

Considerations for using data.table

The package offers a wonderful, nearly one-stop solution that doesn’t require you to write out the join logic yourself, but it has a major problem for a lot of users - it requires you to switch to R, while much of your work may live in Python or in an SQL pipeline. So, what do you do?

Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you’ll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/blog/001_overlap_joins/overlap_algorithm.png b/public/blog/001_overlap_joins/overlap_algorithm.png new file mode 100644 index 0000000..a9e3b35 Binary files /dev/null and b/public/blog/001_overlap_joins/overlap_algorithm.png differ diff --git a/public/blog/002_representative_samples/001_Network_Cluster_1.png b/public/blog/002_representative_samples/001_Network_Cluster_1.png new file mode 100644 index 0000000..827d2bc Binary files /dev/null and b/public/blog/002_representative_samples/001_Network_Cluster_1.png differ diff --git a/public/blog/002_representative_samples/002_Network_Cluster_2.png b/public/blog/002_representative_samples/002_Network_Cluster_2.png new file mode 100644 index 0000000..9dd73e5 Binary files /dev/null and b/public/blog/002_representative_samples/002_Network_Cluster_2.png differ diff --git a/public/blog/002_representative_samples/index.html b/public/blog/002_representative_samples/index.html new file mode 100644 index 0000000..359bb33 --- /dev/null +++ b/public/blog/002_representative_samples/index.html @@ -0,0 +1,243 @@ +Finding representative samples efficiently for large datasets | Avinash's Blog
+

Avinash's Blog

Finding representative samples efficiently for large datasets

Premise

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it can become hard to find a representative dataset to train a model on such that it generalizes well.

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. You need to train a model to classify these data points on either these labels, or on labels derived from the imperfect ones.
  3. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.

In a hurry?

Here’s what you need to do:

  1. Read the premise and see if it fits your problem.
  2. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.

Why do we need representative samples?

Generally, three things come to mind:

  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  3. Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!

Define the data

This data can be practically anything that can be represented as a 2D matrix.

There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten it, there will be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they lie on different rows. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task, and take the 1D representation of each image from the final hidden layer before the output. Other data will need further processing along similar lines.
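As a hedged sketch of that workaround (the model choice, preprocessing and file name are my assumptions, not part of the post), a pretrained CNN with its classification head removed yields a 1D feature vector per image:

import torch
from torchvision import models, transforms
from PIL import Image

# Load a pretrained CNN and drop its classification head, so the forward
# pass returns the penultimate-layer representation (a 1D vector).
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
model.fc = torch.nn.Identity()
model.eval()

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

image = preprocess(Image.open("example.jpg").convert("RGB")).unsqueeze(0)
with torch.no_grad():
    embedding = model(image).squeeze(0)  # shape: (512,) for resnet18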

Get a specific dataset

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output to be without those prefixes.

>> import polars as pl
>> data = pl.read_csv("archive/shopmania.csv")
>> data
shape: (313_705, 4)
┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
 product_ID  product_title                                         category_ID  category_label 
 ---         ---                                                   ---          ---            
 i64         str                                                   i64          str            
╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
 2           twilight central park print                           2            Collectibles   
 3           fox print                                             2            Collectibles   
 4           circulo de papel wall art                             2            Collectibles   
 5           hidden path print                                     2            Collectibles   
 …           …                                                     …            …              
 313703      deago anti fog swimming diving full face mask         229          Water Sports   
             surface snorkel scuba fr gopro black s/m                                          
 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
             diving mask blue large/xtralarge blue                                             
 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
             scuba optional hd camera blue mask only adult men                                 
 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
             mask scuba optional hd camera black mask only                                     
             children and women                                                                
└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

data = (
    data
    .filter(pl.count().over("category_ID") == 10000)
)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

>>> data.get_column("category_label").unique()
shape: (17,)
Series: 'category_label' [str]
[
    "Kitchen & Dining"
    "Scarves and wraps"
    "Handbags & Wallets"
    "Rugs  Tapestry & Linens"
    "Cell Phones Accessories"
    "Men's Clothing"
    "Jewelry"
    "Belts"
    "Men Lingerie"
    "Crafts"
    "Football"
    "Medical Supplies"
    "Adult"
    "Hunting"
    "Women's Clothing"
    "Pet Supply"
    "Office Supplies"
]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.

Specify the task

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

Craft a small representative sample for each category.

Why small? A small sample makes the model faster to train - and keeps the training data manageable in size.

Finding representative samples

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.
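For completeness, here’s a minimal sketch of the TF-IDF alternative mentioned above, using scikit-learn (it isn’t used anywhere else in this post, and the min_df value is arbitrary):

from sklearn.feature_extraction.text import TfidfVectorizer

titles = data.get_column("product_title").to_list()
vectorizer = TfidfVectorizer(min_df=5)           # drop very rare tokens
title_tfidf = vectorizer.fit_transform(titles)   # sparse matrix of shape (n_titles, n_terms)
# Note: this is a sparse matrix, while faiss expects dense float32 arrays - one more
# reason the rest of this post sticks with dense sentence embeddings.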

Getting SentenceTransformer embeddings

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

import sentence_transformers

# See list of models at www.sbert.net/docs/pretrained_models.html
ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
title_embeddings = (
    ST.encode(
        data.get_column("product_title").to_list(),
        show_progress_bar=True, convert_to_tensor=True)
    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.
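If you go down that route, the only line that changes is the model name:

# Smaller, faster model for CPU-only or proof-of-concept runs
ST = sentence_transformers.SentenceTransformer("all-MiniLM-L6-v2")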

The concept of approximate nearest neighbors

Performing any kind of exact nearest neighbor search on medium-scale datasets (even around 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, pairwise distances between data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that ANN algorithms take shortcuts to give you, if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).

There are several algorithms and libraries that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any of them - a full list of the major ones is available here.
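If you’d like to see the exact/approximate distinction concretely, here’s a small standalone sketch (not from the original pipeline - it runs on random data, and the index parameters are illustrative) comparing faiss’s brute-force index with its HNSW-based approximate index:

import faiss
import numpy as np

rng = np.random.default_rng(0)
xb = rng.random((10_000, 64), dtype=np.float32)   # database vectors
xq = xb[:5]                                       # query with a few of them

exact = faiss.IndexFlatL2(64)                     # brute-force: compares against everything
exact.add(xb)
approx = faiss.IndexHNSWFlat(64, 32)              # graph-based approximate index
approx.add(xb)

_, exact_ids = exact.search(xq, 5)
_, approx_ids = approx.search(xq, 5)
# The approximate results usually overlap heavily with the exact ones,
# but the search scales far better on large datasets.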

I’ll explain why we’re in the nearest neighbor territory in due course.

Building the database

To build the database, all we need is the title_embeddings matrix.

import faiss

def create_index(title_embeddings):
    d = title_embeddings.shape[1]    # Number of dimensions
    ann_index = faiss.IndexFlatL2(d) # Index using Euclidean (L2) distance
    ann_index.add(title_embeddings)  # Build the index

    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this by the category (or label). So let’s design a function that subsets the data to a particular category and then creates the database. We’ll need three pieces of information from this function:

  1. The actual faiss database.
  2. The actual subset of data that was used to build this index.
  3. The row indices, with respect to the original data, of the points that went into the faiss database.

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

import faiss
import numpy as np
import polars as pl

def create_index(label):
    faiss_indices = (
        data # this needs to be an argument if you want to create a generic function
        .with_row_count("row_idx")
        .filter(pl.col("category_label") == label)
        .get_column("row_idx")
        .to_list()
    )

    faiss_data = title_embeddings[faiss_indices]
    d = faiss_data.shape[1]         # Number of dimensions
    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
    # Why cosine similarity? It's easier to specify thresholds - the similarity always lies
    # between -1 and 1 (and is close to the [0, 1] range for embeddings like these).
    # If using Euclidean or another distance, we'd have to spend some time finding a good range
    # where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
    faiss_DB.add(faiss_data)        # Build the index

    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method that finds the k nearest neighbors for a given vector, along with the (approximate) distance to each. Let’s then write a function to get the following information: the row index for which nearest neighbors are being searched, the indices of said nearest neighbors, and the distance between them. In network graph parlance, this kind of data is called an edge list i.e. a list of pairs of nodes that are connected, along with any additional information that specifies a property (in this case distance) of the edge that connects these nodes.

def get_edge_list(label, k=5):
    faiss_DB, faiss_data, faiss_indices = create_index(label)
    # To map faiss's positional indices back to row indices in the original data
    faiss_indices_map = {i: x for i, x in enumerate(faiss_indices)}
    # To map the row indices back to the original title strings
    title_name_map = {
        i: x
        for i, x in data.with_row_count("row_idx").select("row_idx", "product_title").rows()}
    distances, neighbors = faiss_DB.search(faiss_data, k)

    return (
        pl.DataFrame({
            "from": faiss_indices})
        .with_columns(
            pl.Series("to", neighbors),
            pl.Series("distance", distances))
        .explode("to", "distance")
        .with_columns(
            pl.col("from")
            .map_dict(title_name_map),
            pl.col("to")
            .map_dict(faiss_indices_map)
            .map_dict(title_name_map))
        .filter(pl.col("from") != pl.col("to"))
    )

NetworkX and Connected Components

The next step in the process is to create a network graph using the edge-list. But why?

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B (i.e. within a particular minimum threshold), then A will be connected to C through B! Hopefully the small visual below helps.

How a network component is formed.
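Here’s a tiny toy version of that A-B-C chain in networkx (purely illustrative - the real edge list comes later):

import networkx as nx

toy = nx.Graph()
toy.add_edge("A", "B")   # A is a near neighbor of B
toy.add_edge("B", "C")   # C is a near neighbor of B, but not of A
print(list(nx.connected_components(toy)))
# [{'A', 'B', 'C'}] - A and C land in the same component through B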

What happens when such a concept is extended to many data points? Not all of them would be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of this rather fast process. Here’s one more helpful visual:

How a network cluster is formed.

Very Starry Night-esque vibes here. Let’s get to the code.

import networkx as nx

def get_cluster_map(label, k=5, min_cosine_distance=0.95):
    edge_list = (
        get_edge_list(label, k=k)
        .filter(pl.col("distance") >= min_cosine_distance)
    )
    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
    return {i: list(x) for i, x in enumerate(nx.connected_components(graph))}

Getting clusters

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phones Accessories.

clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Tweak the following if your results aren’t good enough (an example call follows this list):

  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. Increase the number of nearest neighbors if you want more matches.
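For example (the values here are just for illustration):

# Bigger clusters: relax the similarity threshold
bigger_clusters = get_cluster_map("Cell Phones Accessories", k=5, min_cosine_distance=0.90)
# More candidate matches per point: raise k
more_matches = get_cluster_map("Cell Phones Accessories", k=20, min_cosine_distance=0.95)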

Viewing the components

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

>> clusters[3]
['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

>>> clusters[6]
['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
 ...
 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

For the folks in a hurry!

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

The code

import sentence_transformers
import faiss
import polars as pl
import numpy as np

# Data is read here. You download the files from Kaggle here:
# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
data = pl.read_csv("archive/shopmania.csv", new_columns=[
    "product_ID", "product_title", "category_ID", "category_label"])
data = (
    data
    .filter(pl.count().over("category_ID") == 10000)
    .with_row_count("row_idx")
)


# See list of models at www.sbert.net/docs/pretrained_models.html
ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
title_embeddings = (
    ST.encode(
        data.get_column("product_title").to_list(),
        # I'm on a MacBook, you should use `cuda` or `cpu`
        # if you've got different hardware.
        device="mps",
        show_progress_bar=True, convert_to_tensor=True)
    .cpu().numpy())

# Code to create a FAISS index
def create_index(label):
    faiss_indices = (
        data # this needs to be an argument if you want to create a generic function
        .filter(pl.col("category_label") == label)
        .get_column("row_idx")
        .to_list()
    )

    faiss_data = title_embeddings[faiss_indices]
    d = faiss_data.shape[1]         # Number of dimensions
    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
    faiss_DB.add(faiss_data)        # Build the index

    return faiss_DB, faiss_data, faiss_indices

# Code to create an edge-list
def get_edge_list(label, k=5):
    faiss_DB, faiss_data, faiss_indices = create_index(label)
    # To map faiss's positional indices back to row indices in the original data
    faiss_indices_map = {i: x for i, x in enumerate(faiss_indices)}
    # To map the row indices back to the original title strings
    title_name_map = {i: x for i, x in data.select("row_idx", "product_title").rows()}
    distances, neighbors = faiss_DB.search(faiss_data, k)

    return (
        pl.DataFrame({
            "from": faiss_indices})
        .with_columns(
            pl.Series("to", neighbors),
            pl.Series("distance", distances))
        .explode("to", "distance")
        .with_columns(
            pl.col("from")
            .map_dict(title_name_map),
            pl.col("to")
            .map_dict(faiss_indices_map)
            .map_dict(title_name_map))
        .filter(pl.col("from") != pl.col("to"))
    )

# Code to extract components from a Network Graph
import networkx as nx
def get_cluster_map(label, k=5, min_cosine_distance=0.95):
    edge_list = (
        get_edge_list(label, k=k)
        .filter(pl.col("distance") >= min_cosine_distance)
    )
    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
    return {i: list(x) for i, x in enumerate(nx.connected_components(graph))}

# Example call to a single category to obtain its clusters
clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
# Example call to **all** categories to obtain all clusters
clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

If you want to write down an algorithmic way of looking at this approach, here it is:

  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. Create an ANN database (based on a package such as faiss) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  3. Obtain an edge-list of the k (anywhere from 5 to 100) nearest neighbors for all data points in the ANN database (or a sample of them, in case your dataset is incredibly HUGE).
  4. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.
  5. Map all indices back to their source data points, then pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample! A minimal sketch of this last step follows this list.
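As promised, here’s a minimal sketch of that last step, assuming clusters is the list of per-category cluster maps built in the previous section:

# One representative title per connected component, across all categories.
# Titles that never met the similarity threshold won't appear in any cluster;
# add them back separately if you want full coverage.
representative_titles = [
    members[0]                       # any member works; the first is convenient
    for category_clusters in clusters
    for members in category_clusters.values()
]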

#representative #samples #faiss #approximate #nearest #neighbor #network #graph #networkx #polars #category

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/blog/003_powerpointsnap/01_Shapes.png b/public/blog/003_powerpointsnap/01_Shapes.png new file mode 100644 index 0000000..b2a61a2 Binary files /dev/null and b/public/blog/003_powerpointsnap/01_Shapes.png differ diff --git a/public/blog/003_powerpointsnap/02_Charts.png b/public/blog/003_powerpointsnap/02_Charts.png new file mode 100644 index 0000000..d9c2116 Binary files /dev/null and b/public/blog/003_powerpointsnap/02_Charts.png differ diff --git a/public/blog/003_powerpointsnap/03_Tables.png b/public/blog/003_powerpointsnap/03_Tables.png new file mode 100644 index 0000000..665dfc6 Binary files /dev/null and b/public/blog/003_powerpointsnap/03_Tables.png differ diff --git a/public/blog/003_powerpointsnap/DataLabelsScreenshot.JPG b/public/blog/003_powerpointsnap/DataLabelsScreenshot.JPG new file mode 100644 index 0000000..7590613 Binary files /dev/null and b/public/blog/003_powerpointsnap/DataLabelsScreenshot.JPG differ diff --git a/public/blog/003_powerpointsnap/Revenue_Presentation_1.png b/public/blog/003_powerpointsnap/Revenue_Presentation_1.png new file mode 100644 index 0000000..8933a90 Binary files /dev/null and b/public/blog/003_powerpointsnap/Revenue_Presentation_1.png differ diff --git a/public/blog/003_powerpointsnap/Revenue_Presentation_2.png b/public/blog/003_powerpointsnap/Revenue_Presentation_2.png new file mode 100644 index 0000000..f046049 Binary files /dev/null and b/public/blog/003_powerpointsnap/Revenue_Presentation_2.png differ diff --git a/public/blog/003_powerpointsnap/SnapScreenshot.JPG b/public/blog/003_powerpointsnap/SnapScreenshot.JPG new file mode 100644 index 0000000..6c759de Binary files /dev/null and b/public/blog/003_powerpointsnap/SnapScreenshot.JPG differ diff --git a/public/blog/003_powerpointsnap/Table_Presentation_1.png b/public/blog/003_powerpointsnap/Table_Presentation_1.png new file mode 100644 index 0000000..4bd1c62 Binary files /dev/null and b/public/blog/003_powerpointsnap/Table_Presentation_1.png differ diff --git a/public/blog/003_powerpointsnap/Table_Presentation_2.png b/public/blog/003_powerpointsnap/Table_Presentation_2.png new file mode 100644 index 0000000..d5882e6 Binary files /dev/null and b/public/blog/003_powerpointsnap/Table_Presentation_2.png differ diff --git a/public/blog/003_powerpointsnap/index.html b/public/blog/003_powerpointsnap/index.html new file mode 100644 index 0000000..5e2739a --- /dev/null +++ b/public/blog/003_powerpointsnap/index.html @@ -0,0 +1,24 @@ +Quick hacks to make client-ready presentations | Avinash's Blog
+

Avinash's Blog

Quick hacks to make client-ready presentations

Premise

When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (decks in consulting lingo - not even slide decks). However, it was rather repetitive. Thus was born PowerPointSnap.

What is it?

I’ll write this down as pointers.

  1. It’s a VBA-based PowerPoint add-on. Just a set of commands that work well with each other.
  2. It’s Windows-only - it’s unlikely to work on macOS.
  3. It’s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.

How do I get it?

The project is available on this Github repo. The instructions to install it are available there, but here’s the down-low:

  1. Download the Snap.ppam file to your system.
  2. Enable the developer options.
  3. Go to the Developer tab, and click on PowerPoint Add-ins.
  4. Click on Add New. Choose the location of the file you just downloaded. Click Close.
  5. To uninstall, repeat the process, and simply click on Remove this time.

What can I do with it?

Frankly, a LOT. The base concept of this tool is:

  1. “Set” a shape as the one you want to copy a property from.
  2. Select any property from the list to automatically apply it.

Here’s a non-exhaustive list of all the options available.

Apply properties of shapes directly

This is the part of the interface that can be used for shapes (which include charts and tables).

The UI for copying shape properties

To use it, first select a shape object and click on “Set”. Then, choose the object you want to Snap those properties onto (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.

Note that it’s probably not a good idea to apply a property of a shape to a table - if you want to make the entire table orange, there are probably better built-in ways to do it than to use Snap.

Beautify charts with Snappable properties

Charts are also supported, with dedicated features for them.

The UI for copying chart properties

What do these features do? You should be able to hover over the option and get a tooltip that shows what it’s capable of, but here’s another summary just in case:

  1. Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the “set” chart to the one you’ve selected. I couldn’t put in just $x$ and $y$ here because Microsoft internally doesn’t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn’t work well yet for 3D charts.
  2. Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all look exactly the same from a client perspective. But that’s usually difficult if you’ve already configured the charts a little - which can be remedied with this option!
  3. Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you’ve selected with the way it originally is in the “set” chart. The reason for this feature is simply to avoid going back to Home to click on the Format Painter option again.
  4. Reset Axes Scales: in case you messed up somewhere, you can use this to revert to PowerPoint defaults.

The next two options deserve their own section.

Customize the labels programmatically

Your immediate senior in a consulting environment would frown at your chart, and then exclaim, “I think that’s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it’s a one time thing!”

It’s never a one time affair. But don’t worry, we have this nice feature to help us. If you click on the Customize Label option, you will get this (without the “Set” option):

The UI for customizing labels.

Never mind the rather unfriendly legend entries. They’re just here to demonstrate the kinds of wacky things you can do with your own chart!

Screenshots of the chart snapability

Of course, visuals will do it more justice. For example, look at this image:

There’s a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren’t centered.

Here’s what you can do:

  1. Click on the left chart. Press “Set” in the toolbar for Snap.
  2. Click on the right chart, and then go through the following:
    1. In Shapes, click on Dim. This will align the shapes of the chart.
    2. Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.
    3. You’ll notice that the chart area still doesn’t match, nor does the title.
    4. In Charts, click on Sync Plot Area and Sync Title Area, and watch the magic unfold.
    5. Now, click on the second chart, and click on “Set”. Let’s align the axes of the first chart to the second one.
    6. Click on the first chart, and then in Charts, click Sync Value Axis.
  3. Let’s bring that senior’s exclamation back into play - (s)he wants you to highlight only Profit labels, and that too only every second label. To do this:
    1. Click on Customize Labels after clicking on either chart.
    2. You’ll get the screen shown in the previous section. Make sure to adjust the values such that it’s exactly like the screenshot there.
    3. Click on “Save and Run”. This will save the configuration you’ve selected, and run it on the chart you’ve selected.
    4. Click the other chart. Then, in Charts, click on Rerun Customization.

This is what your results should look like:

Everything is almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it so quickly… maybe they should change some requirements…

Of course, getting those calculations right is a whole different thing that will need some work.

Align table dimensions

Oftentimes, you have two tables that show similar values… you know the drill. Here’s what you can do in a scenario such as this:

Similar data, but vastly different tables.

This is what the Tables section of the tool looks like:

The UI for Tables

To align these tables together,

  1. Click on the left table. Press “Set” in the toolbar for Snap.
  2. Click on the right table.
  3. Click on Shapes, inside it, Dim. Now the shapes of the table are the same.
  4. In Tables, click on Sync Column Widths. Now the columns are also the same.
  5. If you try to align by rows, it fails because the number of rows is not the same in the two tables.

Here’s what you’ll end up with:

Similar data, and similar enough tables.

Pretty neat, eh?

#powerpoint #ppt #vba

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/blog/index.html b/public/blog/index.html new file mode 100644 index 0000000..28fd00b --- /dev/null +++ b/public/blog/index.html @@ -0,0 +1,21 @@ +blog | Avinash's Blog
+

Avinash's Blog

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/blog/index.xml b/public/blog/index.xml new file mode 100644 index 0000000..c4f27cf --- /dev/null +++ b/public/blog/index.xml @@ -0,0 +1,927 @@ +blog on Avinash's Bloghttps://avimallu.dev/blog/Recent content in blog on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaFri, 20 Oct 2023 00:00:00 +0000Quick hacks to make client-ready presentationshttps://avimallu.dev/blog/003_powerpointsnap/Fri, 20 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/003_powerpointsnap/<h1 id="premise">Premise</h1> +<p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (<em>decks</em> in consulting lingo - not even <em>slide decks</em>). However, it was rather repetitive. Thus, was born PowerPointSnap.</p> +<h1 id="what-is-it">What is it?</h1> +<p>I&rsquo;ll write this down as pointers.</p> +<ol> +<li>It&rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.</li> +<li>It&rsquo;s Windows only - it&rsquo;s unlikely to work on MacOS.</li> +<li>It&rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.</li> +</ol> +<h1 id="how-do-i-get-it">How do I get it?</h1> +<p>The project is available on this <a href="https://github.com/avimallu/PowerPointSnap">Github repo</a>. The instructions to install it are available there, but here&rsquo;s the down-low:</p>Premise +

When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (decks in consulting lingo - not even slide decks). However, it was rather repetitive. Thus, was born PowerPointSnap.

+

What is it?

+

I’ll write this down as pointers.

+
    +
  1. It’s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.
  2. +
  3. It’s Windows only - it’s unlikely to work on MacOS.
  4. +
  5. It’s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.
  6. +
+

How do I get it?

+

The project is available on this Github repo. The instructions to install it are available there, but here’s the down-low:

+
    +
  1. Download the Snap.ppam file to your system.
  2. +
  3. Enable the developer options.
  4. +
  5. Go to the Developer tab, and click on PowerPoint Add-ins.
  6. +
  7. Click on Add New. Choose the location of the file you just dowloaded. Click Close.
  8. +
  9. To uninstall, repeat the process, and simply click on Remove this time.
  10. +
+

What can I do with it?

+

Frankly, a LOT. The base concept of this tool is:

+
    +
  1. “Set” a shape as the one you want to copy a property from.
  2. +
  3. Select any property from the list to automatically apply it.
  4. +
+

Here’s a non-exhaustive list of all the options available.

+

Apply properties of shapes directly

+

This is the part of the interface that can be used for shapes (which include charts and tables).

+

The UI for copying shape properties

+

To use, first select a shape object, click on “Set”. Then, choose the object you want to Snap its properties to (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.

+

Note that it’s probably not to apply a property of a shape to a table - if you want to make the entire table orange, there are probably better built-in ways to do it than to use Snap.

+

Beautify charts with Snappable properties

+

Charts are also supported, with dedicated features for it.

+

The UI for copying chart properties

+

What do these features do? You should be able to hover over the option and get a tooltip that shows what it’s capable of, but here’s another summary just in case:

+
    +
  1. Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the “set” chart to the one you’ve selected. I couldn’t put in just $x$ and $y$ here because Microsoft internally doesn’t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn’t work well yet for 3D charts.
  2. +
  3. Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all look exactly the same from a client perspective. But that’s usually difficult if you’ve already configured the charts a little - which can be remedied with this option!
  4. +
  5. Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you’ve selected with the way it originally is in the “set” chart. The reason for this feature is simply to avoid going back to Home to click on the Format Painter option again.
  6. +
  7. Reset Axes Scales: in case you messed up somewhere, you can use this to rever to PowerPoint defaults.
  8. +
+

The next two options deserve their own section.

+

Customize the labels programmatically

+

Your immediate senior in a consulting environment would frown at your chart, and then exclaim, “I think that’s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it’s a one time thing!”

+

It’s never a one time affair. But don’t worry, we have this nice feature to help us. If you click on the Customize Label option, you will get this (without the “Set” option):

+

The UI for customizing labels.

+

Never mind the rather unfriendly legend entries. They’re just here to demonstrate that you can do the following kinds of whacky abilities with your own chart!

+

Screenshots of the chart snapability

+

Of course, visuals will do it more justice. For example, look at this image:

+

There’s a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren’t centered.

+

Here’s what you can do:

+
    +
  1. Click on the left chart. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right chart, and then go through the following: +
      +
    1. In Shapes, click on Dim. This will align the shapes of the chart.
    2. +
    3. Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.
    4. +
    5. You’ll notice that the chart area doesn’t still match, nor does the title.
    6. +
    7. In Charts, click on Sync Plot Area and Sync Title Area, and watch the magic unfold.
    8. +
    9. Now, click on the second chart, and click on “Set”. Let’s align the axes of the first chart to the second one.
    10. +
    11. Click on the first chart, and then in Charts, click Sync Value Axis.
    12. +
    +
  4. +
  5. Let’s bring that senior’s exclamation back into play - (s)he wants you to highlight only Profit labels, and that too every 2 iterations. To do this: +
      +
    1. Click on Customize Labels after clicking on either chart.
    2. +
    3. You’ll get the screen shown in the previous section. Make sure to adjust the values such that it’s exactly like the screenshot there.
    4. +
    5. Click on “Save and Run”. This will save the configuration you’ve selected, and run it on the chart you’ve selected.
    6. +
    7. Click the other chart. Then, in Charts, click on Rerun Customization.
    8. +
    +
  6. +
+

This is what your results should look like:

+

Everything almost consistent. Your senior rests their eyes, and secretly wonder how you managed to do it quickly… maybe they should change some requirements…

+

Of course, getting those calculations right is a whole different thing that will need some work.

+

Align table dimensions

+

Oftentimes, you have two tables that show similar values… you know the drill. Here’s what you can do in a scenario such as this:

+

Similar data, but vastly different tables.

+

This is what the Tables section of the tool looks like:

+

The UI for Tables

+

To align these tables together,

+
    +
  1. Click on the left table. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right table.
  4. +
  5. Click on Shapes, inside it, Dim. Now the shapes of the table are the same.
  6. +
  7. In Tables, click on Sync Column Widths. Now the columns are also the same.
  8. +
  9. If you try to align by rows, it fails because the number of rows are not the same in the two tables.
  10. +
+

Here’s what you’ll end up with:

+

Similar data, and similar enough tables.

+

Pretty neat, eh?

+]]>
Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
    +
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. +
  3. You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.
  4. +
  5. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
  6. +
+

In a hurry?

+

Here’s what you need to do:

+
    +
  1. Read the premise and see if it fits your problem.
  2. +
  3. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
  4. +
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
    +
  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. +
  3. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  4. +
  5. Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!
  6. +
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, they’ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output to be without those prefixes.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? It helps that it’ll make the model faster to train - and keep the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.

+
+

The concept of approximate nearest neighbors

+

Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this was the need to calculate all, or nearly all distances between all data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).

+

There are several algorithms that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones are available here.

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Eucledian Matrix
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this by the category (or label). So let’s design a function that sends only the necessary data as that for a particular category, and then create the database. We’ll need three pieces of information from this function:

+
    +
  1. The actual faiss database.
  2. +
  3. The actual subset of data that was used to build this index.
  4. +
  5. The label indices with respect to the original data that went into the faiss database.
  6. +
+

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = data.shape[1]               # Number of dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(data)        # Normalized L2 with Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - they'll always be between 0 and 1.4.
+19    # If using Eucledian or other distance, we'll have to spend some time finding a good range
+20    # where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(data)              # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method to find the k nearest neighbors for a given index, along with the (approximate) distance to it. Let’s then write a function to get the following information: the label index for whom nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an edge list i.e. a list of pair of nodes that are connected, along with any additional information that specifies a property (in this case distance) of the edge that connects these nodes.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map the data back to the original `train[b'data']` array
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map the indices back to the original strings
+ 6    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are sufficiently similar enough to B within a particular minimum thershold, then A will be connected to C through B! Hopefully a small visual below would help.

+

How a network component is formed.

+

What happens when such a concept is extended for many data points? Not all of them would be connected - because we’re applying a minimum threshold that they have to meet. This is the only hueristic part of the rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very starry night-eque vibes here. Let’s get to the code.

+ + + + + +
1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phone Accessories.

+ + + + + +
1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Make sure to configure the following if your results aren’t good enough:

+
    +
  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. +
  3. Increase the number of nearest neighbors if you want more matches.
  4. +
+

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

+ + + + + +
1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

+ + + + + +
 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

+ + + + + +
1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

+ + + + + +
 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map the FAISS row positions back to the original row indices of `data`
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map the indices back to the original strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want to write down an algorithmic way of looking at this approach,

+
    +
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. +
  3. Create an ANN database (based on a package such as faiss) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  4. +
  5. Obtain an edge-list of k (from 5 to 100) nearest neighbors for all (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.
  6. +
  7. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.
  8. +
  9. Map all indices back to their source data points so that the clusters are interpretable, pick any number of items from each cluster (I usually end up picking one element from each), and you now have your representative sample!
  10. +
+]]>
Overlap Joins: Number of docker trucks in an intervalhttps://avimallu.dev/blog/001_overlap_joins/Thu, 22 Jun 2023 00:00:00 +0000https://avimallu.dev/blog/001_overlap_joins/<h1 id="premise">Premise</h1> +<p>I stumbled upon an interesting <a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stackoverflow question</a> that was linked <a href="https://github.com/pola-rs/polars/issues/9467">via an issue</a> on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.</p> +<p>I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.</p> +<h1 id="problem-statement">Problem Statement</h1> +<p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.</p>Premise +

I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.

+

I’m more of a right-tool-for-the-job person, so I tried to find a better solution.

+

Problem Statement

+

Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck’s ID.

+ + + + + +
 1import polars as pl # if you don't have polars, run 
+ 2                    # pip install 'polars[all]'
+ 3data = pl.from_repr("""
+ 4┌─────────────────────┬─────────────────────┬─────┐
+ 5│ arrival_time        ┆ departure_time      ┆ ID  │
+ 6│ ---                 ┆ ---                 ┆ --- │
+ 7│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+ 8╞═════════════════════╪═════════════════════╪═════╡
+ 9│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1  │
+10│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1  │
+11│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5  │
+12│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+13│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3  │
+14│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3  │
+15│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6  │
+16│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5  │
+17│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6  │
+18└─────────────────────┴─────────────────────┴─────┘
+19""")

We want to identify the number of trucks docked at any given time within a threshold of 1 minute prior to the arrival time of a truck, and 1 minute after the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.

+

Finding a solution to the problem

+

Evaluate for a specific row

+

Before we find a general solution to this problem, let’s consider a specific row to understand the problem better:

+ + + + + +
1"""
+2┌─────────────────────┬─────────────────────┬─────┐
+3│ arrival_time        ┆ departure_time      ┆ ID  │
+4│ ---                 ┆ ---                 ┆ --- │
+5│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+6╞═════════════════════╪═════════════════════╪═════╡
+7│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+8└─────────────────────┴─────────────────────┴─────┘
+9"""

For this row, we need to find the number of trucks that are present between 2023-01-01 06:31:06 (1 minute prior to the arrival_time) and 2023-01-01 06:34:48 (1 minute after the departure_time). Manually going through the original dataset, we see that B3, C3, A6 and A5 are the truck IDs that qualify - each of them is at the station at some point between 2023-01-01 06:31:06 and 2023-01-01 06:34:48.

+

Visually deriving an algorithm

+

There are several cases that qualify a truck as present in the overlap window defined by a particular row. For the example above, we have the following (the visualization generalizes, because for each row we can easily calculate the overlap window relative to its arrival and departure times):

+

The five different ways a period can overlap.

+

Take some time to absorb these cases - they matter for the part where we write the code for the solution. Note that we need to tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 do not satisfy our requirements; a small sketch of these conditions follows.
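Here is a plain-Python sketch of the three qualifying cases, mirroring the SQL conditions that appear later in the post (overlaps is a hypothetical helper written just for illustration, not part of any library):

def overlaps(b_arrival, b_departure, window_open, window_close):
    # Case 2: B arrives before the window opens, but is still docked when it opens
    case_2 = b_arrival <= window_open and b_departure >= window_open
    # Case 3: B's entire stay falls inside the window
    case_3 = b_arrival >= window_open and b_departure <= window_close
    # Case 4: B arrives inside the window (it may leave after the window closes)
    case_4 = window_open <= b_arrival <= window_close
    return case_2 or case_3 or case_4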

+

Writing an SQL query based on the algorithm

+

In theory, any language capable of expressing the rules outlined in the section above can be used to find the solution. Why choose SQL? It often conveys the logic behind an algorithm elegantly; and while it can get excessively verbose at times, it doesn’t here.

+

Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).

+

Introducing the DuckDB package

+

Once again, in theory, any SQL package or language can be used. Few, however, match the ease of use that DuckDB provides:

+
    +
  1. no expensive set-up time (meaning no need for setting up databases, even temporary ones),
  2. +
  3. no dependencies (other than DuckDB itself, just pip install duckdb),
  4. +
  5. some very friendly SQL extensions, and
  6. +
  7. ability to work directly on Polars and Pandas DataFrames without conversions
  8. +
+

all with mind-blowing speed that stands shoulder-to-shoulder with Polars (a tiny illustration of that last point follows below). We’ll also use a few advanced SQL concepts, noted below.
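As a hedged sketch of that last point (it only assumes that duckdb and polars are installed), DuckDB can query a Polars DataFrame that is simply in scope, with no registration step:

import duckdb
import polars as pl

df = pl.DataFrame({"ID": ["A1", "A5", "A6"], "n": [1, 2, 3]})
# The DataFrame is referenced by its Python variable name inside the SQL string.
duckdb.sql("SELECT ID, n * 10 AS n_times_ten FROM df").show()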

+

Self-joins

+

This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.
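A toy sketch (duckdb assumed, data made up) to make the idea concrete - the same relation appears on both sides of the join, under two aliases:

import duckdb

duckdb.sql("""
    WITH trucks AS (SELECT UNNEST(['A1', 'A5', 'A6']) AS ID)
    SELECT a.ID AS truck_a, b.ID AS truck_b
    FROM trucks a JOIN trucks b ON a.ID <> b.ID  -- every truck paired with every other truck
""").show()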

+

A bullet train recap of non-equi joins

+

A key concept that we’ll use is the idea of joining on a range of values rather than a specific value. That is, instead of the usual LEFT JOIN ON A.column = B.column, we can do LEFT JOIN ON A.column <= B.column, allowing one row in table A to match multiple rows in B. DuckDB has a blog post that outlines this kind of join in detail, including its fast implementation.
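Here is a minimal, made-up example (duckdb assumed) of such a range condition - each row of A matches every row of B with a value greater than or equal to its own:

import duckdb

duckdb.sql("""
    WITH A AS (SELECT UNNEST([1, 2, 3]) AS a),
         B AS (SELECT UNNEST([2, 3]) AS b)
    SELECT * FROM A LEFT JOIN B ON A.a <= B.b
""").show()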

+

The concept of LIST columns

+

DuckDB has first class support for LIST columns - that is, each row in a LIST column can have a varying length (much like a Python list), but must have the exact same datatype (like R’s vector). Using LIST columns allows us to eschew an additional GROUP BY operation on top of a WHERE filter or SELECT DISTINCT operation, since we can perform those directly on the LIST column itself.
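A small, self-contained illustration (duckdb assumed) of the three list functions used in the query below - LIST aggregates values into a list per group, LIST_DISTINCT de-duplicates that list, and LIST_UNIQUE counts its distinct elements:

import duckdb

duckdb.sql("""
    SELECT
        g,
        LIST(v)                AS all_values,
        LIST_DISTINCT(LIST(v)) AS distinct_values,
        LIST_UNIQUE(LIST(v))   AS distinct_count
    FROM (VALUES ('x', 1), ('x', 1), ('x', 2), ('y', 3)) t(g, v)
    GROUP BY g
""").show()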

+

Date algebra

+

Dates can be rather difficult to handle well in most tools and languages, with several packages purpose-built to make handling them easier - lubridate from the tidyverse is a stellar example. Thankfully, DuckDB provides a similar Swiss Army knife set of tools, including INTERVALs (a special data type that represents a period of time independent of specific time values) that can modify TIMESTAMP values through addition or subtraction.
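For instance (a throwaway sketch, duckdb assumed), the one-minute padding used later in this post is just interval arithmetic on timestamps:

import duckdb

duckdb.sql("""
    SELECT
        TIMESTAMP '2023-01-01 06:32:06' - INTERVAL 1 MINUTE AS window_open,
        TIMESTAMP '2023-01-01 06:33:48' + INTERVAL 1 MINUTE AS window_close
""").show()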

+

Tell me the query, PLEASE!

+

Okay - that was a lot of background. Let’s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN (
+16    SELECT *
+17        ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+18    FROM data) B
+19
+20ON ((B.arrival_time <= A.window_open AND 
+21    	(B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+22    (B.arrival_time >= A.window_open AND 
+23                                  B.departure_time  <= A.window_close) OR
+24    (B.arrival_time >= A.window_open AND
+25    	(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close))
+26GROUP BY 1, 2, 3, 4

A small, succinct query such as this needs a bit of explanation to take it all in. Here’s one below, reproducible in Python (make sure to install duckdb first!).

+
SQL with explanation. + + + + + +
 1import duckdb as db
+ 2db.query("""
+ 3    SELECT
+ 4        A.arrival_time
+ 5        ,A.departure_time
+ 6        ,A.window_open
+ 7        ,A.window_close
+ 8        -- LIST aggregates the values into a LIST column
+ 9        -- and LIST_DISTINCT finds the unique values in it
+10        ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11        -- finally, LIST_UNIQUE calculates the unique number of values in it
+12        ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+13
+14    FROM (
+15        SELECT
+16            *
+17            ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+18            ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+19        FROM data -- remember we defined data as the Polars DataFrame with our truck station data
+20    ) A
+21
+22    LEFT JOIN (
+23        SELECT
+24            *
+25            -- This is the time, in seconds between the arrival and departure of
+26            -- each truck PER ROW in the original data-frame 
+27            ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+28        FROM data -- this is where we perform a self-join
+29    ) B
+30
+31    ON (
+32        -- Case 2 in the diagram;
+33        (B.arrival_time <= A.window_open AND 
+34            -- Adding the duration here makes sure that the second interval
+35            -- is at least ENDING AFTER the start of the overlap window
+36            (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+37
+38        -- Case 3 in the diagram - the simplest of all five cases
+39        (B.arrival_time >= A.window_open AND 
+40                                      B.departure_time  <= A.window_close) OR
+41
+42        -- Case 4 in the digram;
+43        (B.arrival_time >= A.window_open AND
+44            -- Subtracting the duration here makes sure that the second interval
+45            -- STARTS BEFORE the end of the overlap window.
+46            (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)
+47    )
+48    GROUP BY 1, 2, 3, 4
+49""")
+

The output of this query is:

+ + + + + +
"""
+┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
+│    arrival_time     │   departure_time    │     window_open     │ … │  docked_trucks   │ docked_truck_count │
+│      timestamp      │      timestamp      │      timestamp      │   │    varchar[]     │       uint64       │
+├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
+│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │                  4 │
+│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3]     │                  3 │
+│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3]     │                  3 │
+├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
+│ 9 rows                                                                                  6 columns (5 shown) │
+└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+"""

We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also see how DuckDB integrates seamlessly with an existing Pandas or Polars pipeline, with zero conversion cost. In fact, we can convert the result back to a Polars or Pandas dataframe by appending a method call to the query, i.e. db.query(...).pl() for Polars and db.query(...).df() for Pandas.
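For example (a small sketch; query here is a placeholder for the SQL string shown above, and db is the duckdb module imported earlier):

result_pl = db.query(query).pl()  # Polars DataFrame
result_pd = db.query(query).df()  # Pandas DataFrame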

+

Can we make the SQL simpler?

+

Now that we’ve understood the logic that goes into the query, let’s try to optimize the algorithm. We have the three conditions:

+ + + + + +
1-- Case 2 in the diagram
+2(B.arrival_time <= A.window_open AND 
+3    (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+4-- Case 3 in the diagram
+5(B.arrival_time >= A.window_open AND 
+6                              B.departure_time  <= A.window_close) OR
+7-- Case 4 in the diagram
+8(B.arrival_time >= A.window_open AND
+9    (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)

What is common between these three conditions? It takes a while to see it, but it becomes clear that all three cases require the truck’s arrival to happen before the window closes, and its departure to happen after the window opens. This can be simplified to just:

+ + + + + +
1B.arrival_time   <= A.window_close AND
+2B.departure_time >= A.window_open

making our query much simpler!

+

Simplified SQL: Part 1

+

We’ve removed the need for the duration calculation altogether now. Therefore, we can write:

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN data B
+16
+17ON (
+18    B.arrival_time   <= A.window_close AND
+19    B.departure_time >= A.window_open
+20)
+21GROUP BY 1, 2, 3, 4

Can we simplify this even further?

+

Simplification: Part 2

+

I think the SQL query in the above section is already very easy to read. However, it is a little clunky overall, and we can leverage DuckDB’s extensive optimizations to improve legibility by rewriting the query as a cross join:

+ + + + + +
 1SELECT
+ 2    A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.arrival_time - (INTERVAL 1 MINUTE)   AS window_open
+ 5    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8FROM  data A, data B
+ 9WHERE B.arrival_time   <= window_close
+10AND   B.departure_time >= window_open
+11GROUP BY 1, 2, 3, 4

Why does this work? Before optimization on DuckDB, this is what the query plan looks like:

+
DuckDB query plan before optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│           FILTER          │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│     (arrival_time <=      │                             
+25│(departure_time + to_m...  │                             
+26│        AS BIGINT))))      │                             
+27│    (departure_time >=     │                             
+28│(arrival_time - to_min...  │                             
+29│        AS BIGINT))))      │                             
+30└─────────────┬─────────────┘                                                          
+31┌─────────────┴─────────────┐                             
+32│       CROSS_PRODUCT       ├──────────────┐              
+33└─────────────┬─────────────┘              │                                           
+34┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+35│         ARROW_SCAN        ││         ARROW_SCAN        │
+36└───────────────────────────┘└───────────────────────────┘ 
+37"""                            
+

After optimization, the CROSS_PRODUCT is automatically optimized to an interval join!

+
DuckDB query plan after optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│      COMPARISON_JOIN      │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│           INNER           │                             
+25│ ((departure_time + '00:01 │                             
+26│     :00'::INTERVAL) >=    ├──────────────┐              
+27│        arrival_time)      │              │              
+28│((arrival_time - '00:01:00'│              │              
+29│       ::INTERVAL) <=      │              │              
+30│       departure_time)     │              │              
+31└─────────────┬─────────────┘              │                                           
+32┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+33│         ARROW_SCAN        ││         ARROW_SCAN        │
+34└───────────────────────────┘└───────────────────────────┘
+35"""                      
+

So in effect, we’re exploiting a feature of DuckDB that allows us to write our queries in a suboptimal manner for greater readability, letting the optimizer do a good chunk of the work for us. I wouldn’t recommend relying on this in general, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.

+

How to get query plans?

+

I’m glad you asked. Here’s the DuckDB page explaining EXPLAIN (heh). Here’s the code I used:

+ + + + + +
 1import duckdb as db
+ 2db.sql("SET EXPLAIN_OUTPUT='all';")
+ 3print(db.query("""
+ 4EXPLAIN
+ 5SELECT
+ 6    A.arrival_time
+ 7    ,A.departure_time
+ 8    ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
+ 9    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+10    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+12FROM  data A, data B
+13WHERE B.arrival_time   <= window_close
+14AND   B.departure_time >= window_open
+15GROUP BY 1, 2, 3, 4
+16""").pl()[1, 1])

What are the alternatives?

+

The data.table way

+

data.table is a package that has historically been ahead of its time - in both speed and features. Development has taken a hit recently, but will likely pick back up. It’s my favourite package for data manipulation on all fronts, but it suffers from the lack of broader R support across the ML and DL space.

+

The foverlaps function

+

If this kind of overlapping join is common, shouldn’t someone have developed a package for it? Turns out, data.table has, and with very specific constraints that make it the perfect solution to our problem (if you don’t mind switching over to R, that is).

+

The foverlaps function has these requirements:

+
    +
  1. The input data.table objects have to be keyed for automatic recognition of columns.
  2. +
  3. The default match type (any) matches all three cases from the image above. Side note: it also supports other match types, such as within, start and end.
  4. +
  5. The last two matching columns in the join condition in by must specify the start and end points of the overlapping window. This isn’t a problem for us now, but it does restrict future uses where we may want non-equi joins on other conditions.
  6. +
+

The code, si, the code!

+

Without further ado:

+ + + + + +
 1library(data.table)
+ 2library(lubridate)
+ 3
+ 4######### BOILERPLATE CODE, NO LOGIC HERE ####################
+ 5arrival_time = as_datetime(c(
+ 6  '2023-01-01 06:23:47.000000', '2023-01-01 06:26:42.000000',
+ 7  '2023-01-01 06:30:20.000000', '2023-01-01 06:32:06.000000',
+ 8  '2023-01-01 06:33:09.000000', '2023-01-01 06:34:08.000000',
+ 9  '2023-01-01 06:36:40.000000', '2023-01-01 06:37:43.000000',
+10  '2023-01-01 06:39:48.000000'))
+11departure_time = as_datetime(c(
+12  '2023-01-01 06:25:08.000000', '2023-01-01 06:28:02.000000',
+13  '2023-01-01 06:35:01.000000', '2023-01-01 06:33:48.000000',
+14  '2023-01-01 06:36:01.000000', '2023-01-01 06:39:49.000000',
+15  '2023-01-01 06:38:34.000000', '2023-01-01 06:40:48.000000',
+16  '2023-01-01 06:46:10.000000'))
+17ID = c('A1', 'A1', 'A5', 'A6', 'B3', 'C3', 'A6', 'A5', 'A6')
+18
+19DT = data.table(
+20  arrival_time = arrival_time,
+21  departure_time = departure_time,
+22  ID = ID)
+23######### BOILERPLATE CODE, NO LOGIC HERE ####################
+24
+25# A copy(DT) creates a copy of a data.table that isn't linked
+26# to the original one, so that changes in it don't reflect in
+27# the original DT object.
+28# The `:=` allow assignment by reference (i.e. "in place").
+29DT_with_windows = copy(DT)[, `:=`(
+30  window_start   = arrival_time   - minutes(1),
+31  window_end = departure_time + minutes(1))]
+32
+33# This step is necessary for the second table, but not the first, but we
+34# key both data.tables to make the foverlap code very succinct.
+35setkeyv(DT, c("arrival_time", "departure_time"))
+36setkeyv(DT_with_windows, c("window_start", "window_end"))
+37
+38# The foverlap function returns a data.table, so we can simply apply
+39# the usual data.table syntax on it!
+40# Since we have the same name of some columns in both data.tables,
+41# the latter table's columns are prefixed with "i." to avoid conflicts.
+42foverlaps(DT, DT_with_windows)[
+43  , .(docked_trucks = list(unique(i.ID)),
+44      docked_truck_count = uniqueN(i.ID))
+45  , .(arrival_time, departure_time)]

provides us the output:

+ + + + + +
 1          arrival_time      departure_time docked_trucks docked_truck_count
+ 2                <POSc>              <POSc>        <list>              <int>
+ 31: 2023-01-01 06:23:47 2023-01-01 06:25:08            A1                  1
+ 42: 2023-01-01 06:26:42 2023-01-01 06:28:02            A1                  1
+ 53: 2023-01-01 06:30:20 2023-01-01 06:35:01   A5,A6,B3,C3                  4
+ 64: 2023-01-01 06:32:06 2023-01-01 06:33:48   A5,A6,B3,C3                  4
+ 75: 2023-01-01 06:33:09 2023-01-01 06:36:01   A5,A6,B3,C3                  4
+ 86: 2023-01-01 06:34:08 2023-01-01 06:39:49   A5,A6,B3,C3                  4
+ 97: 2023-01-01 06:36:40 2023-01-01 06:38:34   B3,C3,A6,A5                  4
+108: 2023-01-01 06:37:43 2023-01-01 06:40:48      C3,A6,A5                  3
+119: 2023-01-01 06:39:48 2023-01-01 06:46:10      C3,A5,A6                  3

Considerations for using data.table

+

The package offers a wonderful, nearly one-stop solution that doesn’t require you to write the logic out for the query or command yourself, but has a major problem for a lot of users - it requires you to switch your codebase to R, and a lot of your tasks may be on Python or in an SQL pipeline. So, what do you do?

+

Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you’ll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.

+]]>
\ No newline at end of file diff --git a/public/categories/index.html b/public/categories/index.html new file mode 100644 index 0000000..8db3705 --- /dev/null +++ b/public/categories/index.html @@ -0,0 +1,5 @@ +Categories | Avinash's Blog
+

Avinash's Blog

Filtering for "Categories"

  • No posts yet
© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/categories/index.xml b/public/categories/index.xml new file mode 100644 index 0000000..96562d3 --- /dev/null +++ b/public/categories/index.xml @@ -0,0 +1 @@ +Categories on Avinash's Bloghttps://avimallu.dev/categories/Recent content in Categories on Avinash's BlogHugo -- gohugo.ioen-US© Avinash Mallya \ No newline at end of file diff --git a/public/favicon.ico b/public/favicon.ico new file mode 100644 index 0000000..5c7d39b Binary files /dev/null and b/public/favicon.ico differ diff --git a/public/herman.min.css b/public/herman.min.css new file mode 100644 index 0000000..def88db --- /dev/null +++ b/public/herman.min.css @@ -0,0 +1 @@ +:root{font-size:62.5%;--color-dark:#181a20;--color-light:#fafafa;--color-primary:#1a8fe3;--size:1rem;--spacing:calc(var(--size) * 2.4)}body{background:var(--color-dark);color:var(--color-light);padding:4rem;font-family:Avenir,avenir next lt pro,Montserrat,Corbel,urw gothic,source-sans-pro,sans-serif,apple color emoji,segoe ui emoji,segoe ui symbol,noto color emoji;font-size:calc(var(--size) * 1.8);line-height:1.5;min-height:80vh;max-width:1600px;margin:0 auto;word-wrap:break-word}header,main,footer{max-width:70ch;margin-inline:auto}header{padding-bottom:var(--spacing)}nav a,a.blog-tags{margin-right:calc(var(--spacing)/2)}a.blog-tags{line-height:2}main{padding-bottom:var(--spacing)}footer{text-align:center;padding-top:var(--spacing)}a{color:currentColor;text-decoration-color:var(--color-primary);text-decoration-thickness:.3ex;text-underline-offset:.3ex}a:hover{text-decoration-thickness:.4ex}img{display:block;max-width:100%;height:auto}h1,h2,h3,h4{font-weight:700;line-height:1.3}h1{font-size:calc(var(--size) * 4.2)}h2{font-size:calc(var(--size) * 3.4)}h3{font-size:calc(var(--size) * 2.6)}h4{font-size:calc(var(--size) * 1.8)}ul,ol{padding-inline-start:var(--spacing)}li{margin-block-start:var(--spacing)}blockquote{padding-inline-start:var(--spacing);border-inline-start:.2em solid;font-style:italic;max-width:50ch}:is(h1,h2,h3,h4,blockquote){margin-block-end:calc(var(--spacing)/2)}:is(h1,h2,h3,h4)+*{margin-block-start:calc(var(--spacing)/3)}:is(h1,h2,h3,h4)+:where(h2,h3,h4){margin-block-start:calc(var(--spacing) * 2)}.title{text-decoration:none}.title h1{font-size:calc(var(--size) * 3.4);margin-top:calc(var(--spacing)/2)}ul.blog-posts{list-style-type:none;padding:unset}ul.blog-posts li{display:flex;flex-direction:column}ul.blog-posts li span{min-width:11ch}p.byline{opacity:.5}code{font-family:ui-monospace,cascadia code,source code pro,Menlo,Consolas,dejavu sans mono,monospace;padding:2px calc(var(--spacing)/4);background-color:#282a36;font-size:calc(var(--size) * 1.4)}pre code{display:block;padding:var(--spacing);overflow-x:auto;-webkit-text-size-adjust:100%;-moz-text-size-adjust:100%}table{width:100%}table,th,td{border:1px solid;border-collapse:collapse;border-color:var(--color-light);padding:calc(var(--spacing)/2)}.disabled{color:currentColor;cursor:not-allowed;opacity:.5}@media screen and (min-width:600px){ul.blog-posts li{flex-direction:row;gap:calc(var(--spacing)/2)}}.skip-link{position:absolute;top:5;transform:translateY(-600%);transition:transform .5s;background-color:#181a20;padding:6px}.skip-link:focus{transform:translateY(0%)}figure{margin-inline-start:0;margin-inline-end:0}figcaption>p{margin-block-start:9px;text-align:center;font-style:italic} \ No newline at end of file diff --git a/public/index.html b/public/index.html new file mode 100644 index 0000000..e58d664 --- /dev/null +++ b/public/index.html @@ 
-0,0 +1,11 @@ +about | Avinash's Blog
+

Avinash's Blog

Hi there!

My name is Avinash Mallya (pronounced Uh-vin-aash Muh-ll-yeah), and I’m a data scientist by profession. This website is a creative outlet, and my piece of the internet where I show off.

What’s here?

You’ll find the following:

  • A few posts where I show up some creative ways that I’ve solved complex problems.
  • Links to projects that I’ve worked on, or have contributed to.
  • An assortment of random things I’ve found interesting.

Contact

You can find me on:

Please reach out via one of the above if you want to talk.

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/index.xml b/public/index.xml new file mode 100644 index 0000000..5b7510e --- /dev/null +++ b/public/index.xml @@ -0,0 +1,950 @@ +Avinash's Bloghttps://avimallu.dev/Recent content on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaFri, 20 Oct 2023 00:00:00 +0000projectshttps://avimallu.dev/projects/Mon, 01 Jan 0001 00:00:00 +0000https://avimallu.dev/projects/<p>Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.</p> +<h1 id="featured-projects">Featured projects</h1> +<ol> +<li><a href="https://avimallu.github.io/BorrowChecker/">BorrowChecker</a>: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. <a href="https://github.com/avimallu/BorrowChecker">Repository link</a>.</li> +<li><a href="https://github.com/avimallu/PowerPointSnap">PowerPointSnap</a>: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying <a href="https://avimallu.dev/blog/003_powerpointsnap/">blog post</a>.</li> +</ol> +<h1 id="other-work-or-contributions">Other work or contributions</h1> +<ol> +<li><a href="https://github.com/avimallu/IntelligentReceiptSplitter">IntelligentReceiptSplitter</a>: A relatively simple predecessor to <a href="https://avimallu.github.io/BorrowChecker/">BorrowChecker</a> that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run.</li> +<li><a href="https://github.com/avimallu/r.data.table.funs">r.data.table.funs</a>: A very small set of R functions that use <code>data.table</code>, that I found very useful earlier in my career to quicky churn out analyses. It is not ground-breaking, but rather something that anybody with sufficient basic skills in R and understand, and save an immense amount of time.</li> +<li>I <a href="https://github.com/pola-rs/polars-book/pull/364">wrote</a> <a href="https://github.com/pola-rs/polars-book/pull/358">several</a> <a href="https://github.com/pola-rs/polars-book/pull/365/files">chapters</a> of the Polars Book, which have since been moved to the main Polars repository. Polars was a breadth of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like <code>data.table</code> and <code>dplyr</code> dominated), so I was eager to make it better for everybody making the switch.</li> +</ol>Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.

+

Featured projects

+
    +
  1. BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link.
  2. +
  3. PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post.
  4. +
+

Other work or contributions

+
    +
  1. IntelligentReceiptSplitter: A relatively simple predecessor to BorrowChecker that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run.
  2. +
  3. r.data.table.funs: A very small set of R functions that use data.table, which I found very useful earlier in my career to quickly churn out analyses. It is not ground-breaking, but rather something that anybody with basic skills in R can understand and use to save an immense amount of time.
  4. +
  5. I wrote several chapters of the Polars Book, which have since been moved to the main Polars repository. Polars was a breath of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like data.table and dplyr dominated), so I was eager to make it better for everybody making the switch.
  6. +
+]]>
Quick hacks to make client-ready presentationshttps://avimallu.dev/blog/003_powerpointsnap/Fri, 20 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/003_powerpointsnap/<h1 id="premise">Premise</h1> +<p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (<em>decks</em> in consulting lingo - not even <em>slide decks</em>). However, it was rather repetitive. Thus, was born PowerPointSnap.</p> +<h1 id="what-is-it">What is it?</h1> +<p>I&rsquo;ll write this down as pointers.</p> +<ol> +<li>It&rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.</li> +<li>It&rsquo;s Windows only - it&rsquo;s unlikely to work on MacOS.</li> +<li>It&rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.</li> +</ol> +<h1 id="how-do-i-get-it">How do I get it?</h1> +<p>The project is available on this <a href="https://github.com/avimallu/PowerPointSnap">Github repo</a>. The instructions to install it are available there, but here&rsquo;s the down-low:</p>Premise +

When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (decks in consulting lingo - not even slide decks). However, it was rather repetitive. Thus, was born PowerPointSnap.

+

What is it?

+

I’ll write this down as pointers.

+
    +
  1. It’s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.
  2. +
  3. It’s Windows only - it’s unlikely to work on MacOS.
  4. +
  5. It’s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.
  6. +
+

How do I get it?

+

The project is available on this Github repo. The instructions to install it are available there, but here’s the down-low:

+
    +
  1. Download the Snap.ppam file to your system.
  2. +
  3. Enable the developer options.
  4. +
  5. Go to the Developer tab, and click on PowerPoint Add-ins.
  6. +
  7. Click on Add New. Choose the location of the file you just downloaded. Click Close.
  8. +
  9. To uninstall, repeat the process, and simply click on Remove this time.
  10. +
+

What can I do with it?

+

Frankly, a LOT. The base concept of this tool is:

+
    +
  1. “Set” a shape as the one you want to copy a property from.
  2. +
  3. Select any property from the list to automatically apply it.
  4. +
+

Here’s a non-exhaustive list of all the options available.

+

Apply properties of shapes directly

+

This is the part of the interface that can be used for shapes (which include charts and tables).

+

The UI for copying shape properties

+

To use, first select a shape object, click on “Set”. Then, choose the object you want to Snap its properties to (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.

+

Note that it’s probably not a good idea to apply a property of a shape to a table - if you want to make the entire table orange, there are better built-in ways to do it than Snap.

+

Beautify charts with Snappable properties

+

Charts are also supported, with dedicated features for it.

+

The UI for copying chart properties

+

What do these features do? You should be able to hover over the option and get a tooltip that shows what it’s capable of, but here’s another summary just in case:

+
    +
  1. Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the “set” chart to the one you’ve selected. I couldn’t put in just $x$ and $y$ here because Microsoft internally doesn’t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn’t work well yet for 3D charts.
  2. +
  3. Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all look exactly the same from a client perspective. But that’s usually difficult if you’ve already configured the charts a little - which can be remedied with this option!
  4. +
  5. Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you’ve selected with the way it originally is in the “set” chart. The reason for this feature is simply to avoid going back to Home to click on the Format Painter option again.
  6. +
  7. Reset Axes Scales: in case you messed up somewhere, you can use this to revert to PowerPoint defaults.
  8. +
+

The next two options deserve their own section.

+

Customize the labels programmatically

+

Your immediate senior in a consulting environment would frown at your chart, and then exclaim, “I think that’s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it’s a one time thing!”

+

It’s never a one time affair. But don’t worry, we have this nice feature to help us. If you click on the Customize Label option, you will get this (without the “Set” option):

+

The UI for customizing labels.

+

Never mind the rather unfriendly legend entries. They’re just here to demonstrate the kinds of wacky things you can do with your own chart!

+

Screenshots of the chart snapability

+

Of course, visuals will do it more justice. For example, look at this image:

+

There’s a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren’t centered.

+

Here’s what you can do:

+
    +
  1. Click on the left chart. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right chart, and then go through the following: +
      +
    1. In Shapes, click on Dim. This will align the shapes of the chart.
    2. +
    3. Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.
    4. +
    5. You’ll notice that the chart area still doesn’t match, nor does the title.
    6. +
    7. In Charts, click on Sync Plot Area and Sync Title Area, and watch the magic unfold.
    8. +
    9. Now, click on the second chart, and click on “Set”. Let’s align the axes of the first chart to the second one.
    10. +
    11. Click on the first chart, and then in Charts, click Sync Value Axis.
    12. +
    +
  4. +
  5. Let’s bring that senior’s exclamation back into play - (s)he wants you to highlight only Profit labels, and that too every 2 iterations. To do this: +
      +
    1. Click on Customize Labels after clicking on either chart.
    2. +
    3. You’ll get the screen shown in the previous section. Make sure to adjust the values such that it’s exactly like the screenshot there.
    4. +
    5. Click on “Save and Run”. This will save the configuration you’ve selected, and run it on the chart you’ve selected.
    6. +
    7. Click the other chart. Then, in Charts, click on Rerun Customization.
    8. +
    +
  6. +
+

This is what your results should look like:

+

Everything is almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it so quickly… maybe they should change some requirements…

+

Of course, getting those calculations right is a whole different thing that will need some work.

+

Align table dimensions

+

Oftentimes, you have two tables that show similar values… you know the drill. Here’s what you can do in a scenario such as this:

+

Similar data, but vastly different tables.

+

This is what the Tables section of the tool looks like:

+

The UI for Tables

+

To align these tables together,

+
    +
  1. Click on the left table. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right table.
  4. +
  5. Click on Shapes, inside it, Dim. Now the shapes of the table are the same.
  6. +
  7. In Tables, click on Sync Column Widths. Now the columns are also the same.
  8. +
  9. If you try to align by rows, it fails because the number of rows is not the same in the two tables.
  10. +
+

Here’s what you’ll end up with:

+

Similar data, and similar enough tables.

+

Pretty neat, eh?

+]]>
Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
    +
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. +
  3. You need to train a model to classify these data points on either these labels, or on labels derived from these imperfect labels.
  4. +
  5. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
  6. +
+

In a hurry?

+

Here’s what you need to do:

+
    +
  1. Read the premise and see if it fits your problem.
  2. +
  3. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
  4. +
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
    +
  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. +
  3. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  4. +
  5. Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!
  6. +
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten the images, there will be significant correlation between features. For example, a face can appear practically anywhere in an image, and all pixels centered around the face will be highly correlated, even if they are on different rows. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task, and use the 1D representation of the image from the final hidden layer before the output. Other data will need further processing along similar lines.
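As a hedged sketch of that workaround (it assumes torch and torchvision are available, and uses a ResNet-18 purely as an example of a "generic" CNN), you can drop the classification head and keep the penultimate-layer features as your 2D matrix:

import torch
import torchvision.models as models

cnn = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
cnn.fc = torch.nn.Identity()  # drop the classifier; keep the 512-dim features
cnn.eval()

with torch.no_grad():
    batch = torch.rand(8, 3, 224, 224)  # stand-in for real, preprocessed images
    features = cnn(batch)               # shape (8, 512): one row per image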

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output will appear without those prefixes.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? It makes the model faster to train - and keeps the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.
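If you prefer the classic route mentioned above, a TF-IDF matrix (sketch below, scikit-learn assumed) also gives you a 2D representation - just with less semantic awareness than sentence embeddings:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2)
title_tfidf = vectorizer.fit_transform(data.get_column("product_title").to_list())
# title_tfidf is a sparse (n_titles x n_terms) matrix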

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.

+
+

The concept of approximate nearest neighbors

+

Performing any kind of exact nearest neighbor search on even medium-scale datasets (bordering on 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, pairwise distances between data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that ANN algorithms take shortcuts to give you, if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).
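To make concrete what is being approximated, here is a small brute-force sketch (my own illustration on random data, not code from the post) of exact nearest neighbor search - the full distance computation that ANN libraries avoid paying.

import numpy as np

rng = np.random.default_rng(0)
points = rng.normal(size=(10_000, 64)).astype(np.float32)  # stand-in for embeddings

def exact_knn(query, points, k=5):
    # One distance per point in the dataset - this is the expensive part.
    dists = np.linalg.norm(points - query, axis=1)
    idx = np.argsort(dists)[:k]
    return idx, dists[idx]

neighbors, distances = exact_knn(points[0], points, k=5)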

+

There are several algorithms you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is available here.

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Eucledian Matrix
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples per category (or label) - which means we need to build one index per category. So let’s design a function that subsets the data to a particular category and then creates the database from that subset. We’ll need three pieces of information from this function:

+
  1. The actual faiss database.
  2. The actual subset of data that was used to build this index.
  3. The label indices, with respect to the original data, that went into the faiss database.
+

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = faiss_data.shape[1]         # Number of embedding dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(faiss_data)  # L2-normalized vectors + Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - the scores are always bounded
+19    # (between -1 and 1). If using Euclidean or another distance, we'd have to spend some time
+20    # finding a range where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(faiss_data)        # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors of all data points in the database. This isn’t too hard - faiss index objects have a built-in search method that finds the k nearest neighbors of a given point, along with the (approximate) distance to each. Let’s then write a function that returns the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors, and the distance between them. In network graph parlance, this kind of data is called an edge list, i.e. a list of pairs of connected nodes, along with any additional information that specifies a property (in this case, distance) of the edge connecting them.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map faiss's internal positions back to row indices in the original data
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map those row indices back to the original title strings
+ 6    title_name_map = {i: x for i,x in data.with_row_count("row_idx").select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B within a particular minimum threshold, then A will be connected to C through B! Hopefully the small visual below will help.

+

How a network component is formed.
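The same transitive idea in a toy networkx example (my own illustration, not the post’s data): the edges A-B and B-C place all three nodes in one connected component, even though A and C were never directly linked.

import networkx as nx

g = nx.Graph()
g.add_edge("A", "B")
g.add_edge("B", "C")
print(list(nx.connected_components(g)))  # [{'A', 'B', 'C'}]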

+

What happens when such a concept is extended to many data points? Not all of them will be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of this rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very Starry Night-esque vibes here. Let’s get to the code.

+ + + + + +
1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phones Accessories.

+ + + + + +
1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Make sure to configure the following if your results aren’t good enough:

+
  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. Increase the number of nearest neighbors if you want more matches.
+

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

+ + + + + +
1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

+ + + + + +
 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']
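Since the cluster indices aren’t stable across runs, a quick way to find the big clusters (my addition, not from the post) is to rank the connected components by size:

# `clusters` here is the dict returned by get_cluster_map for a single category.
cluster_sizes = sorted(((len(members), idx) for idx, members in clusters.items()), reverse=True)
print(cluster_sizes[:5])                  # five largest clusters as (size, cluster_id)
print(clusters[cluster_sizes[0][1]][:3])  # peek at a few members of the biggest one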

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

+ + + + + +
1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

+ + + + + +
 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map faiss's internal positions back to row indices in the original data
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map those row indices back to the original title strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want to write down an algorithmic way of looking at this approach,

+
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. Create an ANN database (based on a package such as faiss) that allows fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  3. Obtain an edge-list of the k (from 5 to 100) nearest neighbors for all data points in the ANN database (or a sample of them, in case your dataset is incredibly HUGE).
  4. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph built from the filtered edge-list you just created.
  5. Map all indices back to their source data points, and pick any number of items from each cluster (I usually end up picking one element from each cluster) - you now have your representative sample! See the sketch right after this list.
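A minimal sketch of that last step, assuming clusters is the list of per-category dictionaries built in the “Running for all categories” section (one {component_id: [titles]} dict per category):

representative_titles = []
for category_clusters in clusters:              # one dict per category
    for members in category_clusters.values():  # each value is a list of product titles
        representative_titles.append(members[0])  # any member would do; take the first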
+]]>
Overlap Joins: Number of docker trucks in an intervalhttps://avimallu.dev/blog/001_overlap_joins/Thu, 22 Jun 2023 00:00:00 +0000https://avimallu.dev/blog/001_overlap_joins/<h1 id="premise">Premise</h1> +<p>I stumbled upon an interesting <a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stackoverflow question</a> that was linked <a href="https://github.com/pola-rs/polars/issues/9467">via an issue</a> on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.</p> +<p>I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.</p> +<h1 id="problem-statement">Problem Statement</h1> +<p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.</p>Premise +

I stumbled upon an interesting Stackoverflow question that was linked via an issue on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question, Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.

+

I’m more of a right-tool-for-the-job person, so I tried to find a better solution.

+

Problem Statement

+

Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck’s ID.

+ + + + + +
 1import polars as pl # if you don't have polars, run 
+ 2                    # pip install 'polars[all]'
+ 3data = pl.from_repr("""
+ 4┌─────────────────────┬─────────────────────┬─────┐
+ 5│ arrival_time        ┆ departure_time      ┆ ID  │
+ 6│ ---                 ┆ ---                 ┆ --- │
+ 7│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+ 8╞═════════════════════╪═════════════════════╪═════╡
+ 9│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1  │
+10│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1  │
+11│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5  │
+12│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+13│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3  │
+14│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3  │
+15│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6  │
+16│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5  │
+17│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6  │
+18└─────────────────────┴─────────────────────┴─────┘
+19""")

We want to identify the number of trucks docked at any given time within a threshold of 1 minute prior to the arrival time of a truck, and 1 minute after the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.

+

Finding a solution to the problem

+

Evaluate for a specific row

+

Before we find a general solution to this problem, let’s consider a specific row to understand the problem better:

+ + + + + +
1"""
+2┌─────────────────────┬─────────────────────┬─────┐
+3│ arrival_time        ┆ departure_time      ┆ ID  │
+4│ ---                 ┆ ---                 ┆ --- │
+5│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+6╞═════════════════════╪═════════════════════╪═════╡
+7│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+8└─────────────────────┴─────────────────────┴─────┘
+9"""

For this row, we need to find the number of trucks that are there between 2023-01-01 06:31:06 (1 minute prior to the arrival_time) and 2023-01-01 06:34:48 (1 minute past the departure_time). Manually going through the original dataset, we see that B3, C3, A6 and A5 are the truck IDs that qualify - they are all at the station at some point between 2023-01-01 06:31:06 and 2023-01-01 06:34:48.
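As a quick sanity check of that manual count (my own snippet, not part of the final solution), we can filter the original frame to trucks whose stay touches this row’s window at any point:

import polars as pl
from datetime import datetime

window_open  = datetime(2023, 1, 1, 6, 31, 6)
window_close = datetime(2023, 1, 1, 6, 34, 48)

print(
    data.filter(
        (pl.col("arrival_time") <= window_close)
        & (pl.col("departure_time") >= window_open)
    ).get_column("ID").unique().to_list()
)  # ['A5', 'A6', 'B3', 'C3'] (order may vary)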

+

Visually deriving an algorithm

+

There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap window relative to the arrival and departure times):

+

The five different ways a period can overlap.

+

Take some time to absorb these cases - it’s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.

+

Writing an SQL query based on the algorithm

+

In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It’s often able to convey elegantly the logic that was used to execute the algorithm; and while it does come with excessive verbosity at times, it doesn’t quite in this case.

+

Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).

+

Introducing the DuckDB package

+

Once again, in theory, any SQL package or language can be used. Few, however, match the ease of use that DuckDB provides:

+
    +
  1. no expensive set-up time (meaning no need for setting up databases, even temporary ones),
  2. +
  3. no dependencies (other than DuckDB itself, just pip install duckdb),
  4. +
  5. some very friendly SQL extensions, and
  6. +
  7. ability to work directly on Polars and Pandas DataFrames without conversions
  8. +
+

all with mind-blowing speed that stands shoulder-to-shoulder with Polars. We’ll also use a few advanced SQL concepts noted below.

+

Self-joins

+

This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.

+

A bullet train recap of non-equi joins

+

A key concept that we’ll use is the idea of joining on a range of values rather than a specific value. That is, instead of the usual LEFT JOIN ON A.column = B.column, we can do LEFT JOIN ON A.column <= B.column for one row in table A to match to multiple rows in B. DuckDB has a blog post that outlines this join in detail, including fast implementation.

+

The concept of LIST columns

+

DuckDB has first class support for LIST columns - that is, each row in a LIST column can have a varying length (much like a Python list), but must have the exact same datatype (like R’s vector). Using list columns allows us to eschew an additional GROUP BY operation on top of a WHERE filter or SELECT DISTINCT operation, since we can perform those directly on the LIST column itself.
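Here is a tiny standalone illustration (mine, not the post’s query) of the LIST functions used below: LIST() aggregates values into a list, LIST_DISTINCT() removes duplicates from a list, and LIST_UNIQUE() counts its distinct elements.

import duckdb as db

print(db.query("""
    SELECT
        LIST(x)                AS all_values,       -- [1, 1, 2, 3]
        LIST_DISTINCT(LIST(x)) AS distinct_values,  -- [1, 2, 3] (order not guaranteed)
        LIST_UNIQUE(LIST(x))   AS n_distinct        -- 3
    FROM (VALUES (1), (1), (2), (3)) t(x)
"""))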

+

Date algebra

+

Dates can be rather difficult to handle well in most tools and languages, with several packages purpose-built to make handling them easier - lubridate from the tidyverse is a stellar example. Thankfully, DuckDB provides a similar Swiss-Army-knife set of tools to deal with them, including INTERVALs (a special data type that represents a period of time independent of specific time values) that modify TIMESTAMP values using addition or subtraction.
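A minimal example (with illustrative values) of that INTERVAL arithmetic - shifting a TIMESTAMP back and forward by one minute, exactly how the windows are built in the query below:

import duckdb as db

print(db.query("""
    SELECT
        TIMESTAMP '2023-01-01 06:32:06' - INTERVAL 1 MINUTE AS window_open,
        TIMESTAMP '2023-01-01 06:33:48' + INTERVAL 1 MINUTE AS window_close
"""))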

+

Tell me the query, PLEASE!

+

Okay - that was a lot of background. Let’s have at it! The query by itself in SQL is (see immediately below it for runnable Python code):

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN (
+16    SELECT *
+17        ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+18    FROM data) B
+19
+20ON ((B.arrival_time <= A.window_open AND 
+21    	(B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+22    (B.arrival_time >= A.window_open AND 
+23                                  B.departure_time  <= A.window_close) OR
+24    (B.arrival_time >= A.window_open AND
+25    	(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close))
+26GROUP BY 1, 2, 3, 4

A small, succinct query such as this will need a bit of explanation to take it all in. Here’s one below, reproducible in Python (make sure to install duckdb first!). Expand it to view.

+
SQL with explanation. + + + + + +
 1import duckdb as db
+ 2db.query("""
+ 3    SELECT
+ 4        A.arrival_time
+ 5        ,A.departure_time
+ 6        ,A.window_open
+ 7        ,A.window_close
+ 8        -- LIST aggregates the values into a LIST column
+ 9        -- and LIST_DISTINCT finds the unique values in it
+10        ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11        -- finally, LIST_UNIQUE calculates the unique number of values in it
+12        ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+13
+14    FROM (
+15        SELECT
+16            *
+17            ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+18            ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+19        FROM data -- remember we defined data as the Polars DataFrame with our truck station data
+20    ) A
+21
+22    LEFT JOIN (
+23        SELECT
+24            *
+25            -- This is the time, in seconds between the arrival and departure of
+26            -- each truck PER ROW in the original data-frame 
+27            ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+28        FROM data -- this is where we perform a self-join
+29    ) B
+30
+31    ON (
+32        -- Case 2 in the diagram;
+33        (B.arrival_time <= A.window_open AND 
+34            -- Adding the duration here makes sure that the second interval
+35            -- is at least ENDING AFTER the start of the overlap window
+36            (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+37
+38        -- Case 3 in the diagram - the simplest of all five cases
+39        (B.arrival_time >= A.window_open AND 
+40                                      B.departure_time  <= A.window_close) OR
+41
+42        -- Case 4 in the digram;
+43        (B.arrival_time >= A.window_open AND
+44            -- Subtracting the duration here makes sure that the second interval
+45            -- STARTS BEFORE the end of the overlap window.
+46            (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)
+47    )
+48    GROUP BY 1, 2, 3, 4
+49""")
+

The output of this query is:

+ + + + + +
"""
+┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
+│    arrival_time     │   departure_time    │     window_open     │ … │  docked_trucks   │ docked_truck_count │
+│      timestamp      │      timestamp      │      timestamp      │   │    varchar[]     │       uint64       │
+├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
+│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │                  4 │
+│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3]     │                  3 │
+│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3]     │                  3 │
+├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
+│ 9 rows                                                                                  6 columns (5 shown) │
+└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+"""

We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also see how DuckDB seamlessly integrates with an existing Pandas or Polars pipeline with zero conversion cost. In fact, we can convert the result back to a Polars or Pandas dataframe by appending a conversion call, i.e. db.query(...).pl() for Polars and db.query(...).df() for Pandas.

+

Can we make the SQL simpler?

+

Now that we’ve understood the logic that goes into the query, let’s try to optimize the algorithm. We have the three conditions:

+ + + + + +
1-- Case 2 in the diagram
+2(B.arrival_time <= A.window_open AND 
+3    (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+4-- Case 3 in the diagram
+5(B.arrival_time >= A.window_open AND 
+6                              B.departure_time  <= A.window_close) OR
+7-- Case 4 in the diagram
+8(B.arrival_time >= A.window_open AND
+9    (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)

What is common between these three conditions? It takes a while to see, but it becomes clear that all three cases boil down to requiring the other truck’s arrival (the start of its stay) to fall before the window closes, and its departure (the end of its stay) to fall after the window opens. This can be simplified to just:

+ + + + + +
1B.arrival_time   <= A.window_close AND
+2B.departure_time >= A.window_open

making our query much simpler!

+

Simplified SQL: Part 1

+

We’ve removed the need for the duration calculation altogether now. Therefore, we can write:

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN data B
+16
+17ON (
+18    B.arrival_time   <= A.window_close AND
+19    B.departure_time >= A.window_open
+20)
+21GROUP BY 1, 2, 3, 4

Can we simplify this even further?

+

Simplification: Part 2

+

I think the SQL query in the above section is already very easy to read. However, it is a little clunky overall, and there is a way we can leverage DuckDB’s extensive optimizations to improve legibility by rewriting the query as a cross join:

+ + + + + +
 1SELECT
+ 2    A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.arrival_time - (INTERVAL 1 MINUTE)   AS window_open
+ 5    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8FROM  data A, data B
+ 9WHERE B.arrival_time   <= window_close
+10AND   B.departure_time >= window_open
+11GROUP BY 1, 2, 3, 4

Why does this work? Before optimization on DuckDB, this is what the query plan looks like:

+
DuckDB query plan before optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│           FILTER          │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│     (arrival_time <=      │                             
+25│(departure_time + to_m...  │                             
+26│        AS BIGINT))))      │                             
+27│    (departure_time >=     │                             
+28│(arrival_time - to_min...  │                             
+29│        AS BIGINT))))      │                             
+30└─────────────┬─────────────┘                                                          
+31┌─────────────┴─────────────┐                             
+32│       CROSS_PRODUCT       ├──────────────┐              
+33└─────────────┬─────────────┘              │                                           
+34┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+35│         ARROW_SCAN        ││         ARROW_SCAN        │
+36└───────────────────────────┘└───────────────────────────┘ 
+37"""                            
+

After optimization, the CROSS_PRODUCT is automatically optimized to an interval join!

+
DuckDB query plan after optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│      COMPARISON_JOIN      │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│           INNER           │                             
+25│ ((departure_time + '00:01 │                             
+26│     :00'::INTERVAL) >=    ├──────────────┐              
+27│        arrival_time)      │              │              
+28│((arrival_time - '00:01:00'│              │              
+29│       ::INTERVAL) <=      │              │              
+30│       departure_time)     │              │              
+31└─────────────┬─────────────┘              │                                           
+32┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+33│         ARROW_SCAN        ││         ARROW_SCAN        │
+34└───────────────────────────┘└───────────────────────────┘
+35"""                      
+

So, in effect, we’re exploiting a feature of DuckDB that lets us write our query in a suboptimal manner for greater readability, letting the optimizer do a good chunk of the work for us. I wouldn’t recommend relying on this in general, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.

+

How to get query plans?

+

I’m glad you asked. Here’s the DuckDB page explaining EXPLAIN (heh). Here’s the code I used:

+ + + + + +
 1import duckdb as db
+ 2db.sql("SET EXPLAIN_OUTPUT='all';")
+ 3print(db.query("""
+ 4EXPLAIN
+ 5SELECT
+ 6    A.arrival_time
+ 7    ,A.departure_time
+ 8    ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
+ 9    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+10    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+12FROM  data A, data B
+13WHERE B.arrival_time   <= window_close
+14AND   B.departure_time >= window_open
+15GROUP BY 1, 2, 3, 4
+16""").pl()[1, 1])

What are the alternatives?

+

The data.table way

+

data.table is a package that has historically been ahead of its time - in both speed and features. Development has taken a hit recently, but will likely pick back up. It’s my favourite package on all fronts for data manipulation, but it suffers simply from the lack of broader R support across the ML and DL space.

+

The foverlaps function

+

If this kind of overlapping join is common, shouldn’t someone have developed a package for it? Turns out, data.table has, and with very specific constraints that make it the perfect solution to our problem (if you don’t mind switching over to R, that is).

+

The foverlaps function has these requirements:

+
  1. The input data.table objects have to be keyed for automatic recognition of columns.
  2. The default match type matches all three cases from the image above. Side note: it also supports other match types, such as requiring one interval to fall within the other, or matching on the start or end points of the windows.
  3. The last two matching columns in the join condition (by) must specify the start and end points of the overlapping window. This isn’t a problem for us now, but it does restrict future uses where we may want non-equi joins on other conditions.
+

The code, si, the code!

+

Without further ado:

+ + + + + +
 1library(data.table)
+ 2library(lubridate)
+ 3
+ 4######### BOILERPLATE CODE, NO LOGIC HERE ####################
+ 5arrival_time = as_datetime(c(
+ 6  '2023-01-01 06:23:47.000000', '2023-01-01 06:26:42.000000',
+ 7  '2023-01-01 06:30:20.000000', '2023-01-01 06:32:06.000000',
+ 8  '2023-01-01 06:33:09.000000', '2023-01-01 06:34:08.000000',
+ 9  '2023-01-01 06:36:40.000000', '2023-01-01 06:37:43.000000',
+10  '2023-01-01 06:39:48.000000'))
+11departure_time = as_datetime(c(
+12  '2023-01-01 06:25:08.000000', '2023-01-01 06:28:02.000000',
+13  '2023-01-01 06:35:01.000000', '2023-01-01 06:33:48.000000',
+14  '2023-01-01 06:36:01.000000', '2023-01-01 06:39:49.000000',
+15  '2023-01-01 06:38:34.000000', '2023-01-01 06:40:48.000000',
+16  '2023-01-01 06:46:10.000000'))
+17ID = c('A1', 'A1', 'A5', 'A6', 'B3', 'C3', 'A6', 'A5', 'A6')
+18
+19DT = data.table(
+20  arrival_time = arrival_time,
+21  departure_time = departure_time,
+22  ID = ID)
+23######### BOILERPLATE CODE, NO LOGIC HERE ####################
+24
+25# A copy(DT) creates a copy of a data.table that isn't linked
+26# to the original one, so that changes in it don't reflect in
+27# the original DT object.
+28# The `:=` allow assignment by reference (i.e. "in place").
+29DT_with_windows = copy(DT)[, `:=`(
+30  window_start   = arrival_time   - minutes(1),
+31  window_end = departure_time + minutes(1))]
+32
+33# This step is necessary for the second table, but not the first, but we
+34# key both data.tables to make the foverlap code very succinct.
+35setkeyv(DT, c("arrival_time", "departure_time"))
+36setkeyv(DT_with_windows, c("window_start", "window_end"))
+37
+38# The foverlap function returns a data.table, so we can simply apply
+39# the usual data.table syntax on it!
+40# Since we have the same name of some columns in both data.tables,
+41# the latter table's columns are prefixed with "i." to avoid conflicts.
+42foverlaps(DT, DT_with_windows)[
+43  , .(docked_trucks = list(unique(i.ID)),
+44      docked_truck_count = uniqueN(i.ID))
+45  , .(arrival_time, departure_time)]

provides us the output:

+ + + + + +
 1          arrival_time      departure_time docked_trucks docked_truck_count
+ 2                <POSc>              <POSc>        <list>              <int>
+ 31: 2023-01-01 06:23:47 2023-01-01 06:25:08            A1                  1
+ 42: 2023-01-01 06:26:42 2023-01-01 06:28:02            A1                  1
+ 53: 2023-01-01 06:30:20 2023-01-01 06:35:01   A5,A6,B3,C3                  4
+ 64: 2023-01-01 06:32:06 2023-01-01 06:33:48   A5,A6,B3,C3                  4
+ 75: 2023-01-01 06:33:09 2023-01-01 06:36:01   A5,A6,B3,C3                  4
+ 86: 2023-01-01 06:34:08 2023-01-01 06:39:49   A5,A6,B3,C3                  4
+ 97: 2023-01-01 06:36:40 2023-01-01 06:38:34   B3,C3,A6,A5                  4
+108: 2023-01-01 06:37:43 2023-01-01 06:40:48      C3,A6,A5                  3
+119: 2023-01-01 06:39:48 2023-01-01 06:46:10      C3,A5,A6                  3

Considerations for using data.table

+

The package offers a wonderful, nearly one-stop solution that doesn’t require you to write out the query logic yourself, but it has a major problem for a lot of users - it requires you to switch your codebase to R, while a lot of your tasks may be in Python or in an SQL pipeline. So, what do you do?

+

Weigh the effort of maintaining an additional dependency (R) in your analytics pipeline against the effort you’d need to invest to run R from Python - or to run an R script in your pipeline and pull its output back in - and make your call.
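For what it’s worth, here is one hedged way to wire the R/data.table step into a Python pipeline. The script name and CSV path are assumptions for illustration: the R script is expected to write its foverlaps() result to a CSV that Python then reads back.

import subprocess
import polars as pl

subprocess.run(["Rscript", "overlap_counts.R"], check=True)  # hypothetical script that writes docked_counts.csv
docked = pl.read_csv("docked_counts.csv")                    # pull the result back into the pipeline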

+]]>
\ No newline at end of file diff --git a/public/original.min.css b/public/original.min.css new file mode 100644 index 0000000..f10e29a --- /dev/null +++ b/public/original.min.css @@ -0,0 +1 @@ +code{text-size-adjust:100%;-ms-text-size-adjust:100%;-moz-text-size-adjust:100%;-webkit-text-size-adjust:100%}body{font-family:Verdana,sans-serif;margin:auto;padding:20px;max-width:720px;text-align:left;background-color:#1d1f27;word-wrap:break-word;overflow-wrap:break-word;line-height:1.5;color:#c9d1d9}h1,h2,h3,h4,h5,h6,strong,b{color:#eee}a{color:#8cc2dd}.title{text-decoration:none;border:0}.title h1{font-size:24px;margin:19.92px 0}.title span{font-weight:400}nav a{margin-right:10px}textarea{background-color:#252525;color:#ddd;width:100%;font-size:16px}input{background-color:#252525;color:#ddd;font-size:16px}content{line-height:1.6}table{width:100%}table,th,td{border:1px solid;border-collapse:collapse;border-color:#c9d1d9;padding:5px}img{max-width:100%;height:auto}code{padding:2px 5px;color:#f8f8f2;background-color:#282a36}pre code{display:block;padding:20px;white-space:pre-wrap;font-size:14px;overflow-x:auto;text-wrap:nowrap}blockquote{border-left:1px solid #999;color:#ccc;padding-left:20px;font-style:italic}footer{padding:25px;text-align:center}.helptext{color:#aaa;font-size:small}.errorlist{color:#eba613;font-size:small}ul.blog-posts{list-style-type:none;padding:unset}ul.blog-posts li{display:flex;margin-bottom:10px}ul.blog-posts li span{flex:0 0 130px}ul.blog-posts li a:visited{color:#8b6fcb}a.blog-tags{line-height:2;margin-right:12px}h3.blog-filter{margin-bottom:0}.disabled{color:currentColor;cursor:not-allowed;opacity:.7}p.byline{font-style:italic}.skip-link{position:absolute;top:5;transform:translateY(-600%);transition:transform .5s;background-color:#1d1f27;padding:6px}.skip-link:focus{transform:translateY(0%)}figure{margin-inline-start:0;margin-inline-end:0}figcaption>p{margin-block-start:0;text-align:center;font-style:italic;color:#ccc} \ No newline at end of file diff --git a/public/posts/001_overlap_joins/index.html b/public/posts/001_overlap_joins/index.html new file mode 100644 index 0000000..fd9ca39 --- /dev/null +++ b/public/posts/001_overlap_joins/index.html @@ -0,0 +1,562 @@ + + + + + + + +Overlap Joins | Avinash's Blog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Avinash's Blog

+ +
+
+ +

Overlap Joins

+ + + +

Premise

+

I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.

+

I’m more of a right-tool-for-the-job person, so I tried to find a better solution.

+

Problem Statement

+

Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck’s ID.

+ + + + + +
 1import polars as pl # if you don't have polars, run 
+ 2                    # pip install 'polars[all]'
+ 3data = pl.from_repr("""
+ 4┌─────────────────────┬─────────────────────┬─────┐
+ 5│ arrival_time        ┆ departure_time      ┆ ID  │
+ 6│ ---                 ┆ ---                 ┆ --- │
+ 7│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+ 8╞═════════════════════╪═════════════════════╪═════╡
+ 9│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1  │
+10│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1  │
+11│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5  │
+12│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+13│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3  │
+14│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3  │
+15│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6  │
+16│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5  │
+17│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6  │
+18└─────────────────────┴─────────────────────┴─────┘
+19""")

We want to identify the number of trucks docked at any given time within a threshold of 1 minute prior to the arrival time of a truck, and 1 minute after the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.

+

Finding a solution to the problem

+

Evaluate for a specific row

+

Before we find a general solution to this problem, let’s consider a specific row to understand the problem better:

+ + + + + +
1"""
+2┌─────────────────────┬─────────────────────┬─────┐
+3│ arrival_time        ┆ departure_time      ┆ ID  │
+4│ ---                 ┆ ---                 ┆ --- │
+5│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+6╞═════════════════════╪═════════════════════╪═════╡
+7│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+8└─────────────────────┴─────────────────────┴─────┘
+9"""

For this row, we need to find the number of trucks that are there between 2023-01-01 06:31:06 (1 minute prior to the arrival_time and 2023-01-01 06:34:48 (1 minute post the departure_time). Manually going through the original dataset, we see that B3, C3, A6 and A5 are the truck IDs that qualify - they all are at the station in a duration that is between 2023-01-01 06:31:06 and 2023-01-01 06:34:48.

+

Visually deriving an algorithm

+

There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap window relative to the arrival and departure times):

+

The five different ways a period can overlap.

+

Take some time to absorb these cases - it’s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.

+

Writing an SQL query based on the algorithm

+

In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It’s often able to convey elegantly the logic that was used to execute the algorithm; and while it does come with excessive verbosity at times, it doesn’t quite in this case.

+

Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).

+

Introducing the DuckDB package

+

Once again, in theory, any SQL package or language can be used. Far too few however meet the ease-of-use that DuckDB provides:

+
    +
  1. no expensive set-up time (meaning no need for setting up databases, even temporary ones),
  2. +
  3. no dependencies (other than DuckDB itself, just pip install duckdb),
  4. +
  5. some very friendly SQL extensions, and
  6. +
  7. ability to work directly on Polars and Pandas DataFrames without conversions
  8. +
+

all with mind-blowing speed that stands shoulder-to-shoulder with Polars. We’ll also use a few advanced SQL concepts noted below.

+

Self-joins

+

This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.

+

A bullet train recap of non-equi joins

+

A key concept that we’ll use is the idea of joining on a range of values rather than a specific value. That is, instead of the usual LEFT JOIN ON A.column = B.column, we can do LEFT JOIN ON A.column <= B.column for one row in table A to match to multiple rows in B. DuckDB has a blog post that outlines this join in detail, including fast implementation.

+

The concept of LIST columns

+

DuckDB has first class support for LIST columns - that is, each row in a LIST column can have a varying length (much like a Python list), but must have the exact same datatype (like R’s vector). Using list columns allow us to eschew the use of an additional GROUP BY operation on top of a WHERE filter or SELECT DISTINCT operation, since we can directly perform those on the LIST column itself.

+

Date algebra

+

Dates can be rather difficult to handle well in most tools and languages, with several packages purpose built to make handling them easier - lubridate from the tidyverse is a stellar example. Thankfully, DuckDB provides a similar swiss-knife set of tools to deal with it, including specifying INTERVALs (a special data type that represent a period of time independent of specific time values) to modify TIMESTAMP values using addition or subtraction.

+

Tell me the query, PLEASE!

+

Okay - had a lot of background. Let’s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN (
+16    SELECT *
+17        ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+18    FROM data) B
+19
+20ON ((B.arrival_time <= A.window_open AND 
+21    	(B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+22    (B.arrival_time >= A.window_open AND 
+23                                  B.departure_time  <= A.window_close) OR
+24    (B.arrival_time >= A.window_open AND
+25    	(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close))
+26GROUP BY 1, 2, 3, 4

A small, succinct query such as this will need a bit of explanation to take it all in. Here’s one below, reproducible in Python (make sure to install duckdb first!). Expand it to view.

+
SQL with explanation. + + + + + +
 1import duckdb as db
+ 2db.query("""
+ 3    SELECT
+ 4        A.arrival_time
+ 5        ,A.departure_time
+ 6        ,A.window_open
+ 7        ,A.window_close
+ 8        -- LIST aggregates the values into a LIST column
+ 9        -- and LIST_DISTINCT finds the unique values in it
+10        ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11        -- finally, LIST_UNIQUE calculates the unique number of values in it
+12        ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+13
+14    FROM (
+15        SELECT
+16            *
+17            ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+18            ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+19        FROM data -- remember we defined data as the Polars DataFrame with our truck station data
+20    ) A
+21
+22    LEFT JOIN (
+23        SELECT
+24            *
+25            -- This is the time, in seconds between the arrival and departure of
+26            -- each truck PER ROW in the original data-frame 
+27            ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+28        FROM data -- this is where we perform a self-join
+29    ) B
+30
+31    ON (
+32        -- Case 2 in the diagram;
+33        (B.arrival_time <= A.window_open AND 
+34            -- Adding the duration here makes sure that the second interval
+35            -- is at least ENDING AFTER the start of the overlap window
+36            (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+37
+38        -- Case 3 in the diagram - the simplest of all five cases
+39        (B.arrival_time >= A.window_open AND 
+40                                      B.departure_time  <= A.window_close) OR
+41
+42        -- Case 4 in the digram;
+43        (B.arrival_time >= A.window_open AND
+44            -- Subtracting the duration here makes sure that the second interval
+45            -- STARTS BEFORE the end of the overlap window.
+46            (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)
+47    )
+48    GROUP BY 1, 2, 3, 4
+49""")
+

The output of this query is:

+ + + + + +
"""
+┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
+│    arrival_time     │   departure_time    │     window_open     │ … │  docked_trucks   │ docked_truck_count │
+│      timestamp      │      timestamp      │      timestamp      │   │    varchar[]     │       uint64       │
+├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
+│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │                  4 │
+│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3]     │                  3 │
+│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3]     │                  3 │
+├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
+│ 9 rows                                                                                  6 columns (5 shown) │
+└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+"""

We clearly see the strengths of DuckDB in how succintly we were able to express this operation. We also find how DuckDB is able to seamlessly integrate with an existing Pandas or Polars pipeline with zero-conversion costs. In fact, we can convert this back to a Polars or Pandas dataframe by appending the ending bracket with db.query(...).pl() and db.query(...).pd() respectively.

+

Can we make the SQL simpler?

+

Now that we’ve understood the logic that goes into the query, let’s try to optimize the algorithm. We have the three conditions:

+ + + + + +
1-- Case 2 in the diagram
+2(B.arrival_time <= A.window_open AND 
+3    (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+4-- Case 3 in the diagram
+5(B.arrival_time >= A.window_open AND 
+6                              B.departure_time  <= A.window_close) OR
+7-- Case 4 in the diagram
+8(B.arrival_time >= A.window_open AND
+9    (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)

What is common between these three conditions? It takes a while to see it; but it becomes clear that all these cases require the start of the overlap to be before the window ends, and the end of the overlap to be after the window starts. This can be simplified to just:

+ + + + + +
1B.arrival_time   <= A.window_close AND
+2B.departure_time >= A.window_open

making our query much simpler!

+

Simplified SQL: Part 1

+

We’ve removed the need for the duration calculation algother now. Therefore, we can write:

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN data B
+16
+17ON (
+18    B.arrival_time   <= A.window_close AND
+19    B.departure_time >= A.window_open
+20)
+21GROUP BY 1, 2, 3, 4

Can we simplify this even further?

+

Simplification: Part 2

+

I think the SQL query in the above section is very easy to ready already. However, it is a little clunky overall, and there is a way that we can leverage DuckDB’s extensive optimizations to simplify our legibility by rewriting the query as a cross join:

+ + + + + +
 1SELECT
+ 2    A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.arrival_time - (INTERVAL 1 MINUTE)   AS window_open
+ 5    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8FROM  data A, data B
+ 9WHERE B.arrival_time   <= window_close
+10AND   B.departure_time >= window_open
+11GROUP BY 1, 2, 3, 4

Why does this work? Before optimization on DuckDB, this is what the query plan looks like:

+
DuckDB query plan before optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│           FILTER          │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│     (arrival_time <=      │                             
+25│(departure_time + to_m...  │                             
+26│        AS BIGINT))))      │                             
+27│    (departure_time >=     │                             
+28│(arrival_time - to_min...  │                             
+29│        AS BIGINT))))      │                             
+30└─────────────┬─────────────┘                                                          
+31┌─────────────┴─────────────┐                             
+32│       CROSS_PRODUCT       ├──────────────┐              
+33└─────────────┬─────────────┘              │                                           
+34┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+35│         ARROW_SCAN        ││         ARROW_SCAN        │
+36└───────────────────────────┘└───────────────────────────┘ 
+37"""                            
+

After optimization, the CROSS_PRODUCT is automatically optimized to an interval join!

+
DuckDB query plan after optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│      COMPARISON_JOIN      │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│           INNER           │                             
+25│ ((departure_time + '00:01 │                             
+26│     :00'::INTERVAL) >=    ├──────────────┐              
+27│        arrival_time)      │              │              
+28│((arrival_time - '00:01:00'│              │              
+29│       ::INTERVAL) <=      │              │              
+30│       departure_time)     │              │              
+31└─────────────┬─────────────┘              │                                           
+32┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+33│         ARROW_SCAN        ││         ARROW_SCAN        │
+34└───────────────────────────┘└───────────────────────────┘
+35"""                      
+

So in effect, we’re exploiting a feature of DuckDB that allows us to write our queries in a suboptimal manner for greater readability, letting the optimizer do a good chunk of the work for us. I wouldn’t recommend relying on this in general, because not all SQL engine optimizers will be able to find an efficient route to these calculations on large datasets.

+

How to get query plans?

+

I’m glad you asked. Here’s the DuckDB page explaining EXPLAIN (heh). Here’s the code I used:

+ + + + + +
 1import duckdb as db
+ 2db.sql("SET EXPLAIN_OUTPUT='all';")
+ 3print(db.query("""
+ 4EXPLAIN
+ 5SELECT
+ 6    A.arrival_time
+ 7    ,A.departure_time
+ 8    ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
+ 9    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+10    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+12FROM  data A, data B
+13WHERE B.arrival_time   <= window_close
+14AND   B.departure_time >= window_open
+15GROUP BY 1, 2, 3, 4
+16""").pl()[1, 1])

What are the alternatives?

+

The data.table way

+

data.table is a package that has historically been ahead of its time - in both speed and features. Development has taken a hit recently, but will likely pick back up. It’s my favourite package on all fronts for data manipulation, but it suffers simply from the lack of broader R support across the ML and DL space.

+

The foverlaps function

+

If this kind of overlapping join is common, shouldn’t someone have developed a package for it? Turns out, data.table has, and with very specific constraints that make it the perfect solution to our problem (if you don’t mind switching over to R, that is).

+

The foverlaps function has these requirements:

+
    +
  1. The input data.table objects have to be keyed for automatic recognition of columns.
  2. +
  3. The default match type ("any") matches all three cases from the image above. Side note: it also offers match types for overlaps falling within the window and for matching start and end points.
  4. +
  5. The last two matching columns in the join condition in by must specify the start and end points of the overlapping window. This isn’t a problem for us now, but it does restrict future uses where we may want non-equi joins on other conditions.
  6. +
+

The code, si, the code!

+

Without further ado:

+ + + + + +
 1library(data.table)
+ 2library(lubridate)
+ 3
+ 4######### BOILERPLATE CODE, NO LOGIC HERE ####################
+ 5arrival_time = as_datetime(c(
+ 6  '2023-01-01 06:23:47.000000', '2023-01-01 06:26:42.000000',
+ 7  '2023-01-01 06:30:20.000000', '2023-01-01 06:32:06.000000',
+ 8  '2023-01-01 06:33:09.000000', '2023-01-01 06:34:08.000000',
+ 9  '2023-01-01 06:36:40.000000', '2023-01-01 06:37:43.000000',
+10  '2023-01-01 06:39:48.000000'))
+11departure_time = as_datetime(c(
+12  '2023-01-01 06:25:08.000000', '2023-01-01 06:28:02.000000',
+13  '2023-01-01 06:35:01.000000', '2023-01-01 06:33:48.000000',
+14  '2023-01-01 06:36:01.000000', '2023-01-01 06:39:49.000000',
+15  '2023-01-01 06:38:34.000000', '2023-01-01 06:40:48.000000',
+16  '2023-01-01 06:46:10.000000'))
+17ID = c('A1', 'A1', 'A5', 'A6', 'B3', 'C3', 'A6', 'A5', 'A6')
+18
+19DT = data.table(
+20  arrival_time = arrival_time,
+21  departure_time = departure_time,
+22  ID = ID)
+23######### BOILERPLATE CODE, NO LOGIC HERE ####################
+24
+25# A copy(DT) creates a copy of a data.table that isn't linked
+26# to the original one, so that changes in it don't reflect in
+27# the original DT object.
+28# The `:=` allows assignment by reference (i.e. "in place").
+29DT_with_windows = copy(DT)[, `:=`(
+30  window_start   = arrival_time   - minutes(1),
+31  window_end = departure_time + minutes(1))]
+32
+33# This step is only strictly necessary for the second table, but we
+34# key both data.tables to make the foverlaps code very succinct.
+35setkeyv(DT, c("arrival_time", "departure_time"))
+36setkeyv(DT_with_windows, c("window_start", "window_end"))
+37
+38# The foverlaps function returns a data.table, so we can simply apply
+39# the usual data.table syntax on it!
+40# Since we have the same name of some columns in both data.tables,
+41# the latter table's columns are prefixed with "i." to avoid conflicts.
+42foverlaps(DT, DT_with_windows)[
+43  , .(docked_trucks = list(unique(i.ID)),
+44      docked_truck_count = uniqueN(i.ID))
+45  , .(arrival_time, departure_time)]

provides us the output:

+ + + + + +
 1          arrival_time      departure_time docked_trucks docked_truck_count
+ 2                <POSc>              <POSc>        <list>              <int>
+ 31: 2023-01-01 06:23:47 2023-01-01 06:25:08            A1                  1
+ 42: 2023-01-01 06:26:42 2023-01-01 06:28:02            A1                  1
+ 53: 2023-01-01 06:30:20 2023-01-01 06:35:01   A5,A6,B3,C3                  4
+ 64: 2023-01-01 06:32:06 2023-01-01 06:33:48   A5,A6,B3,C3                  4
+ 75: 2023-01-01 06:33:09 2023-01-01 06:36:01   A5,A6,B3,C3                  4
+ 86: 2023-01-01 06:34:08 2023-01-01 06:39:49   A5,A6,B3,C3                  4
+ 97: 2023-01-01 06:36:40 2023-01-01 06:38:34   B3,C3,A6,A5                  4
+108: 2023-01-01 06:37:43 2023-01-01 06:40:48      C3,A6,A5                  3
+119: 2023-01-01 06:39:48 2023-01-01 06:46:10      C3,A5,A6                  3

Considerations for using data.table

+

The package offers a wonderful, nearly one-stop solution that doesn’t require you to write out the join logic yourself, but it has a major problem for a lot of users - it requires you to switch your codebase to R, while a lot of your tasks may be in Python or in an SQL pipeline. So, what do you do?

+

Consider the effort of maintaining an additional dependency for your analytics pipeline (i.e. R), plus the effort you’ll need to invest to run R from Python - or to run an R script in your pipeline and pull its output back in - and make your call.
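
If you do keep the R dependency, one low-tech pattern is to shell out to Rscript and read the result back. Below is a rough sketch under that assumption; overlap_counts.R and docked_counts.csv are hypothetical names for your script and its output file:

import subprocess
import polars as pl

# Run the (hypothetical) R script; it is assumed to write docked_counts.csv.
subprocess.run(["Rscript", "overlap_counts.R"], check=True)

# Pull the R output back into the Python side of the pipeline.
docked = pl.read_csv("docked_counts.csv")
print(docked.head())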

+ +
+

+ +

+ + +

+ + Reply to this post by email ↪ + +

+ + + +
+
+ Avinash Mallya | Made with Bear Cub +
+ + + + + diff --git a/public/posts/index.html b/public/posts/index.html new file mode 100644 index 0000000..33f4182 --- /dev/null +++ b/public/posts/index.html @@ -0,0 +1,106 @@ + + + + + + + +Posts | Avinash's Blog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Avinash's Blog

+ +
+
+ + + + +
+ +
+ +
+ +
+
+ Avinash Mallya | Made with Bear Cub +
+ + + + + diff --git a/public/posts/index.xml b/public/posts/index.xml new file mode 100644 index 0000000..fc89518 --- /dev/null +++ b/public/posts/index.xml @@ -0,0 +1,463 @@ + + + + Posts on Avinash's Blog + http://localhost:1313/posts/ + Recent content in Posts on Avinash's Blog + Hugo -- gohugo.io + en-US + me@example.com (John Doe) + me@example.com (John Doe) + Avinash Mallya + Thu, 22 Jun 2023 17:27:50 -0400 + + + Overlap Joins + http://localhost:1313/posts/001_overlap_joins/ + Thu, 22 Jun 2023 17:27:50 -0400me@example.com (John Doe) + http://localhost:1313/posts/001_overlap_joins/ + <h1 id="premise">Premise</h1> <p>I stumbled upon an interesting <a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stackoverflow question</a> that was linked <a href="https://github.com/pola-rs/polars/issues/9467">via an issue</a> on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.</p> <p>I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.</p> <h1 id="problem-statement">Problem Statement</h1> <p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.</p> + Premise +

I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.

+

I’m more of a right-tool-for-the-job person, so I tried to find a better solution.

+

Problem Statement

+

Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck’s ID.

+ + + + + +
 1import polars as pl # if you don't have polars, run 
+ 2                    # pip install 'polars[all]'
+ 3data = pl.from_repr("""
+ 4┌─────────────────────┬─────────────────────┬─────┐
+ 5│ arrival_time        ┆ departure_time      ┆ ID  │
+ 6│ ---                 ┆ ---                 ┆ --- │
+ 7│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+ 8╞═════════════════════╪═════════════════════╪═════╡
+ 9│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1  │
+10│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1  │
+11│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5  │
+12│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+13│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3  │
+14│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3  │
+15│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6  │
+16│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5  │
+17│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6  │
+18└─────────────────────┴─────────────────────┴─────┘
+19""")

We want to identify the number of trucks docked at any given time within a threshold of 1 minute prior to the arrival time of a truck, and 1 minute after the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.

+

Finding a solution to the problem

+

Evaluate for a specific row

+

Before we find a general solution to this problem, let’s consider a specific row to understand the problem better:

+ + + + + +
1"""
+2┌─────────────────────┬─────────────────────┬─────┐
+3│ arrival_time        ┆ departure_time      ┆ ID  │
+4│ ---                 ┆ ---                 ┆ --- │
+5│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+6╞═════════════════════╪═════════════════════╪═════╡
+7│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+8└─────────────────────┴─────────────────────┴─────┘
+9"""

For this row, we need to find the number of trucks that are present between 2023-01-01 06:31:06 (1 minute prior to the arrival_time) and 2023-01-01 06:34:48 (1 minute after the departure_time). Manually going through the original dataset, we see that B3, C3, A6 and A5 are the truck IDs that qualify - they are all at the station at some point between 2023-01-01 06:31:06 and 2023-01-01 06:34:48.
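
To make the manual check concrete, here is a small sketch of my own (using the data frame defined in the problem statement) that keeps only the trucks whose stay intersects that window:

from datetime import datetime, timedelta
import polars as pl

# The window around the A6 row: 1 minute before arrival, 1 minute after departure.
window_open  = datetime(2023, 1, 1, 6, 32, 6)  - timedelta(minutes=1)
window_close = datetime(2023, 1, 1, 6, 33, 48) + timedelta(minutes=1)

# A truck's stay intersects the window if it starts before the window closes
# and ends after the window opens.
overlapping = data.filter(
    (pl.col("arrival_time") <= window_close)
    & (pl.col("departure_time") >= window_open)
)
print(sorted(overlapping["ID"].unique()))  # ['A5', 'A6', 'B3', 'C3']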

+

Visually deriving an algorithm

+

There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap window relative to the arrival and departure times):

+

The five different ways a period can overlap.

+

Take some time to absorb these cases - it’s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.

+

Writing an SQL query based on the algorithm

+

In theory, we can use any language capable of expressing the algorithmic requirements outlined in the above section. Why choose SQL? It often conveys the logic behind an algorithm elegantly; and while it can be excessively verbose at times, it isn’t in this case.

+

Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).

+

Introducing the DuckDB package

+

Once again, in theory, any SQL package or language can be used. Few, however, match the ease of use that DuckDB provides:

+
    +
  1. no expensive set-up time (meaning no need for setting up databases, even temporary ones),
  2. +
  3. no dependencies (other than DuckDB itself, just pip install duckdb),
  4. +
  5. some very friendly SQL extensions, and
  6. +
  7. ability to work directly on Polars and Pandas DataFrames without conversions
  8. +
+

all with mind-blowing speed that stands shoulder-to-shoulder with Polars. We’ll also use a few advanced SQL concepts noted below.

+

Self-joins

+

This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.

+

A bullet train recap of non-equi joins

+

A key concept that we’ll use is the idea of joining on a range of values rather than a specific value. That is, instead of the usual LEFT JOIN ON A.column = B.column, we can do LEFT JOIN ON A.column <= B.column for one row in table A to match to multiple rows in B. DuckDB has a blog post that outlines this join in detail, including fast implementation.
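
As a tiny, made-up illustration of the idea (toy tables of my own, unrelated to the truck data):

import duckdb as db
import polars as pl

thresholds = pl.DataFrame({"name": ["low", "high"], "cutoff": [2, 4]})
vals       = pl.DataFrame({"v": [1, 2, 3, 4, 5]})

# One row of `thresholds` matches every row of `vals` at or below its cutoff -
# that one-to-many matching on an inequality is the non-equi part.
print(db.query("""
    SELECT T.name, V.v
    FROM thresholds T
    LEFT JOIN vals V ON V.v <= T.cutoff
    ORDER BY T.name, V.v
""").pl())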

+

The concept of LIST columns

+

DuckDB has first class support for LIST columns - that is, each row in a LIST column can have a varying length (much like a Python list), but all elements must have the exact same datatype (like R’s vector). Using list columns allows us to eschew an additional GROUP BY operation on top of a WHERE filter or SELECT DISTINCT, since we can perform those operations directly on the LIST column itself.
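
A standalone toy example of my own (not the truck data) showing LIST, LIST_DISTINCT and LIST_UNIQUE working together:

import duckdb as db

print(db.query("""
    SELECT
        grp
        ,LIST(id)                AS ids             -- aggregate rows into a list
        ,LIST_DISTINCT(LIST(id)) AS distinct_ids    -- de-duplicate the list
        ,LIST_UNIQUE(LIST(id))   AS distinct_count  -- count the distinct values
    FROM (VALUES ('a', 'A1'), ('a', 'A1'), ('a', 'A5'), ('b', 'C3')) t(grp, id)
    GROUP BY grp
    ORDER BY grp
"""))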

+

Date algebra

+

Dates can be rather difficult to handle well in most tools and languages, with several packages purpose built to make handling them easier - lubridate from the tidyverse is a stellar example. Thankfully, DuckDB provides a similar Swiss-army-knife set of tools to deal with them, including INTERVALs (a special data type that represents a period of time independent of any specific point in time), which can be added to or subtracted from TIMESTAMP values.
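
For instance, here is a standalone snippet of my own (separate from the main query) shifting a timestamp by a minute in either direction:

import duckdb as db

print(db.query("""
    SELECT
        TIMESTAMP '2023-01-01 06:32:06'                        AS arrival_time
        ,TIMESTAMP '2023-01-01 06:32:06' - (INTERVAL 1 MINUTE) AS window_open
        ,TIMESTAMP '2023-01-01 06:32:06' + (INTERVAL 1 MINUTE) AS one_minute_later
"""))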

+

Tell me the query, PLEASE!

+

Okay - that was a lot of background. Let’s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN (
+16    SELECT *
+17        ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+18    FROM data) B
+19
+20ON ((B.arrival_time <= A.window_open AND 
+21    	(B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+22    (B.arrival_time >= A.window_open AND 
+23                                  B.departure_time  <= A.window_close) OR
+24    (B.arrival_time >= A.window_open AND
+25    	(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close))
+26GROUP BY 1, 2, 3, 4

A small, succinct query such as this will need a bit of explanation to take it all in. Here’s one below, reproducible in Python (make sure to install duckdb first!). Expand it to view.

+
SQL with explanation. + + + + + +
 1import duckdb as db
+ 2db.query("""
+ 3    SELECT
+ 4        A.arrival_time
+ 5        ,A.departure_time
+ 6        ,A.window_open
+ 7        ,A.window_close
+ 8        -- LIST aggregates the values into a LIST column
+ 9        -- and LIST_DISTINCT finds the unique values in it
+10        ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11        -- finally, LIST_UNIQUE calculates the unique number of values in it
+12        ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+13
+14    FROM (
+15        SELECT
+16            *
+17            ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+18            ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+19        FROM data -- remember we defined data as the Polars DataFrame with our truck station data
+20    ) A
+21
+22    LEFT JOIN (
+23        SELECT
+24            *
+25            -- This is the time, in seconds between the arrival and departure of
+26            -- each truck PER ROW in the original data-frame 
+27            ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+28        FROM data -- this is where we perform a self-join
+29    ) B
+30
+31    ON (
+32        -- Case 2 in the diagram;
+33        (B.arrival_time <= A.window_open AND 
+34            -- Adding the duration here makes sure that the second interval
+35            -- is at least ENDING AFTER the start of the overlap window
+36            (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+37
+38        -- Case 3 in the diagram - the simplest of all five cases
+39        (B.arrival_time >= A.window_open AND 
+40                                      B.departure_time  <= A.window_close) OR
+41
+42        -- Case 4 in the diagram;
+43        (B.arrival_time >= A.window_open AND
+44            -- Subtracting the duration here makes sure that the second interval
+45            -- STARTS BEFORE the end of the overlap window.
+46            (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)
+47    )
+48    GROUP BY 1, 2, 3, 4
+49""")
+

The output of this query is:

+ + + + + +
"""
+┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
+│    arrival_time     │   departure_time    │     window_open     │ … │  docked_trucks   │ docked_truck_count │
+│      timestamp      │      timestamp      │      timestamp      │   │    varchar[]     │       uint64       │
+├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
+│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │                  4 │
+│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3]     │                  3 │
+│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3]     │                  3 │
+├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
+│ 9 rows                                                                                  6 columns (5 shown) │
+└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+"""

We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also see how seamlessly DuckDB integrates with an existing Pandas or Polars pipeline, with zero conversion cost. In fact, we can convert the result back to a Polars or Pandas dataframe by appending a method call to the closing bracket: db.query(...).pl() and db.query(...).pd() respectively.

+

Can we make the SQL simpler?

+

Now that we’ve understood the logic that goes into the query, let’s try to optimize the algorithm. We have the three conditions:

+ + + + + +
1-- Case 2 in the diagram
+2(B.arrival_time <= A.window_open AND 
+3    (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+4-- Case 3 in the diagram
+5(B.arrival_time >= A.window_open AND 
+6                              B.departure_time  <= A.window_close) OR
+7-- Case 4 in the diagram
+8(B.arrival_time >= A.window_open AND
+9    (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)

What is common between these three conditions? It takes a moment to see, but all three cases boil down to requiring that the other truck’s stay starts before the window closes and ends after the window opens. This can be simplified to just:

+ + + + + +
1B.arrival_time   <= A.window_close AND
+2B.departure_time >= A.window_open

making our query much simpler!

+

Simplified SQL: Part 1

+

We’ve removed the need for the duration calculation altogether now. Therefore, we can write:

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN data B
+16
+17ON (
+18    B.arrival_time   <= A.window_close AND
+19    B.departure_time >= A.window_open
+20)
+21GROUP BY 1, 2, 3, 4

Can we simplify this even further?

+

Simplification: Part 2

+

I think the SQL query in the above section is already very easy to read. However, it is a little clunky overall, and there is a way we can leverage DuckDB’s extensive optimizations to improve legibility by rewriting the query as a cross join:

+ + + + + +
 1SELECT
+ 2    A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.arrival_time - (INTERVAL 1 MINUTE)   AS window_open
+ 5    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8FROM  data A, data B
+ 9WHERE B.arrival_time   <= window_close
+10AND   B.departure_time >= window_open
+11GROUP BY 1, 2, 3, 4

Why does this work? Before optimization on DuckDB, this is what the query plan looks like:

+
DuckDB query plan before optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│           FILTER          │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│     (arrival_time <=      │                             
+25│(departure_time + to_m...  │                             
+26│        AS BIGINT))))      │                             
+27│    (departure_time >=     │                             
+28│(arrival_time - to_min...  │                             
+29│        AS BIGINT))))      │                             
+30└─────────────┬─────────────┘                                                          
+31┌─────────────┴─────────────┐                             
+32│       CROSS_PRODUCT       ├──────────────┐              
+33└─────────────┬─────────────┘              │                                           
+34┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+35│         ARROW_SCAN        ││         ARROW_SCAN        │
+36└───────────────────────────┘└───────────────────────────┘ 
+37"""                            
+

After optimization, the CROSS_PRODUCT is automatically optimized to an interval join!

+
DuckDB query plan after optimization + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│      COMPARISON_JOIN      │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│           INNER           │                             
+25│ ((departure_time + '00:01 │                             
+26│     :00'::INTERVAL) >=    ├──────────────┐              
+27│        arrival_time)      │              │              
+28│((arrival_time - '00:01:00'│              │              
+29│       ::INTERVAL) <=      │              │              
+30│       departure_time)     │              │              
+31└─────────────┬─────────────┘              │                                           
+32┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+33│         ARROW_SCAN        ││         ARROW_SCAN        │
+34└───────────────────────────┘└───────────────────────────┘
+35"""                      
+

So in effect, we’re exploiting a feature of DuckDB that allows us to write our queries in a suboptimal manner for greater readability, letting the optimizer do a good chunk of the work for us. I wouldn’t recommend relying on this in general, because not all SQL engine optimizers will be able to find an efficient route to these calculations on large datasets.

+

How to get query plans?

+

I’m glad you asked. Here’s the DuckDB page explaining EXPLAIN (heh). Here’s the code I used:

+ + + + + +
 1import duckdb as db
+ 2db.sql("SET EXPLAIN_OUTPUT='all';")
+ 3print(db.query("""
+ 4EXPLAIN
+ 5SELECT
+ 6    A.arrival_time
+ 7    ,A.departure_time
+ 8    ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
+ 9    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+10    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+12FROM  data A, data B
+13WHERE B.arrival_time   <= window_close
+14AND   B.departure_time >= window_open
+15GROUP BY 1, 2, 3, 4
+16""").pl()[1, 1])

What are the alternatives?

+

The data.table way

+

data.table is a package that has historically been ahead of its time - in both speed and features. Development has taken a hit recently, but will likely pick back up. It’s my favourite package on all fronts for data manipulation, but it suffers simply from the lack of broader R support across the ML and DL space.

+

The foverlaps function

+

If this kind of overlapping join is common, shouldn’t someone have developed a package for it? Turns out, data.table has, and with very specific constraints that make it the perfect solution to our problem (if you don’t mind switching over to R, that is).

+

The foverlaps function has these requirements:

+
    +
  1. The input data.table objects have to be keyed for automatic recognition of columns.
  2. +
  3. The default match type ("any") matches all three cases from the image above. Side note: it also offers match types for overlaps falling within the window and for matching start and end points.
  4. +
  5. The last two matching columns in the join condition in by must specify the start and end points of the overlapping window. This isn’t a problem for us now, but it does restrict future uses where we may want non-equi joins on other conditions.
  6. +
+

The code, si, the code!

+

Without further ado:

+ + + + + +
 1library(data.table)
+ 2library(lubridate)
+ 3
+ 4######### BOILERPLATE CODE, NO LOGIC HERE ####################
+ 5arrival_time = as_datetime(c(
+ 6  '2023-01-01 06:23:47.000000', '2023-01-01 06:26:42.000000',
+ 7  '2023-01-01 06:30:20.000000', '2023-01-01 06:32:06.000000',
+ 8  '2023-01-01 06:33:09.000000', '2023-01-01 06:34:08.000000',
+ 9  '2023-01-01 06:36:40.000000', '2023-01-01 06:37:43.000000',
+10  '2023-01-01 06:39:48.000000'))
+11departure_time = as_datetime(c(
+12  '2023-01-01 06:25:08.000000', '2023-01-01 06:28:02.000000',
+13  '2023-01-01 06:35:01.000000', '2023-01-01 06:33:48.000000',
+14  '2023-01-01 06:36:01.000000', '2023-01-01 06:39:49.000000',
+15  '2023-01-01 06:38:34.000000', '2023-01-01 06:40:48.000000',
+16  '2023-01-01 06:46:10.000000'))
+17ID = c('A1', 'A1', 'A5', 'A6', 'B3', 'C3', 'A6', 'A5', 'A6')
+18
+19DT = data.table(
+20  arrival_time = arrival_time,
+21  departure_time = departure_time,
+22  ID = ID)
+23######### BOILERPLATE CODE, NO LOGIC HERE ####################
+24
+25# A copy(DT) creates a copy of a data.table that isn't linked
+26# to the original one, so that changes in it don't reflect in
+27# the original DT object.
+28# The `:=` allows assignment by reference (i.e. "in place").
+29DT_with_windows = copy(DT)[, `:=`(
+30  window_start   = arrival_time   - minutes(1),
+31  window_end = departure_time + minutes(1))]
+32
+33# This step is only strictly necessary for the second table, but we
+34# key both data.tables to make the foverlaps code very succinct.
+35setkeyv(DT, c("arrival_time", "departure_time"))
+36setkeyv(DT_with_windows, c("window_start", "window_end"))
+37
+38# The foverlaps function returns a data.table, so we can simply apply
+39# the usual data.table syntax on it!
+40# Since we have the same name of some columns in both data.tables,
+41# the latter table's columns are prefixed with "i." to avoid conflicts.
+42foverlaps(DT, DT_with_windows)[
+43  , .(docked_trucks = list(unique(i.ID)),
+44      docked_truck_count = uniqueN(i.ID))
+45  , .(arrival_time, departure_time)]

provides us the output:

+ + + + + +
 1          arrival_time      departure_time docked_trucks docked_truck_count
+ 2                <POSc>              <POSc>        <list>              <int>
+ 31: 2023-01-01 06:23:47 2023-01-01 06:25:08            A1                  1
+ 42: 2023-01-01 06:26:42 2023-01-01 06:28:02            A1                  1
+ 53: 2023-01-01 06:30:20 2023-01-01 06:35:01   A5,A6,B3,C3                  4
+ 64: 2023-01-01 06:32:06 2023-01-01 06:33:48   A5,A6,B3,C3                  4
+ 75: 2023-01-01 06:33:09 2023-01-01 06:36:01   A5,A6,B3,C3                  4
+ 86: 2023-01-01 06:34:08 2023-01-01 06:39:49   A5,A6,B3,C3                  4
+ 97: 2023-01-01 06:36:40 2023-01-01 06:38:34   B3,C3,A6,A5                  4
+108: 2023-01-01 06:37:43 2023-01-01 06:40:48      C3,A6,A5                  3
+119: 2023-01-01 06:39:48 2023-01-01 06:46:10      C3,A5,A6                  3

Considerations for using data.table

+

The package offers a wonderful, nearly one-stop solution that doesn’t require you to write out the join logic yourself, but it has a major problem for a lot of users - it requires you to switch your codebase to R, while a lot of your tasks may be in Python or in an SQL pipeline. So, what do you do?

+

Consider the effort of maintaining an additional dependency for your analytics pipeline (i.e. R), plus the effort you’ll need to invest to run R from Python - or to run an R script in your pipeline and pull its output back in - and make your call.

+]]>
+
+
+
diff --git a/public/posts/my-first-post/index.html b/public/posts/my-first-post/index.html new file mode 100644 index 0000000..ea49d7c --- /dev/null +++ b/public/posts/my-first-post/index.html @@ -0,0 +1,110 @@ + + + + + + + +My First Post | Avinash's Blog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Avinash's Blog

+ +
+
+ +

My First Post

+ + + +

Fucking hell.

+ +
+

+ +

+ + +

+ + Reply to this post by email ↪ + +

+ + + +
+
+ Avinash Mallya | Made with Bear Cub +
+ + + + + diff --git a/_posts/2023-06-22-overlap_joins.md b/public/posts/post.exe similarity index 99% rename from _posts/2023-06-22-overlap_joins.md rename to public/posts/post.exe index a51cb4d..3b16073 100644 --- a/_posts/2023-06-22-overlap_joins.md +++ b/public/posts/post.exe @@ -1,9 +1,8 @@ ---- -title: Fast overlap joins in SQL, Python and R -permalink: /docked_trucks_in_interval -author: Avinash Mallya -tags: [python, polars, duckdb, R, data.table, foverlaps, overlap, join] ---- ++++ +date = '2023-06-22T17:27:50-04:00' +draft = false +title = 'Overlap Joins' ++++ # Premise diff --git a/public/posts/post/index.html b/public/posts/post/index.html new file mode 100644 index 0000000..e716566 --- /dev/null +++ b/public/posts/post/index.html @@ -0,0 +1,562 @@ + + + + + + + +Overlap Joins | Avinash's Blog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Avinash's Blog

+ +
+
+ +

Overlap Joins

+ + + +

Premise

+

I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.

+

I’m more of a right-tool-for-the-job person, so I tried to find a better solution.

+

Problem Statement

+

Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck’s ID.

+ + + + + +
 1import polars as pl # if you don't have polars, run 
+ 2                    # pip install 'polars[all]'
+ 3data = pl.from_repr("""
+ 4┌─────────────────────┬─────────────────────┬─────┐
+ 5│ arrival_time        ┆ departure_time      ┆ ID  │
+ 6│ ---                 ┆ ---                 ┆ --- │
+ 7│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+ 8╞═════════════════════╪═════════════════════╪═════╡
+ 9│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1  │
+10│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1  │
+11│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5  │
+12│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+13│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3  │
+14│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3  │
+15│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6  │
+16│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5  │
+17│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6  │
+18└─────────────────────┴─────────────────────┴─────┘
+19""")

We want to identify the number of trucks docked at any given time within a threshold of 1 minute prior to the arrival time of a truck, and 1 minute after the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.

+

Finding a solution to the problem

+

Evaluate for a specific row

+

Before we find a general solution to this problem, let’s consider a specific row to understand the problem better:

+ + + + + +
1"""
+2┌─────────────────────┬─────────────────────┬─────┐
+3│ arrival_time        ┆ departure_time      ┆ ID  │
+4│ ---                 ┆ ---                 ┆ --- │
+5│ datetime[μs]        ┆ datetime[μs]        ┆ str │
+6╞═════════════════════╪═════════════════════╪═════╡
+7│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6  │
+8└─────────────────────┴─────────────────────┴─────┘
+9"""

For this row, we need to find the number of trucks that are present between 2023-01-01 06:31:06 (1 minute prior to the arrival_time) and 2023-01-01 06:34:48 (1 minute after the departure_time). Manually going through the original dataset, we see that B3, C3, A6 and A5 are the truck IDs that qualify - they are all at the station at some point between 2023-01-01 06:31:06 and 2023-01-01 06:34:48.

+

Visually deriving an algorithm

+

There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap window relative to the arrival and departure times):

+

The five different ways a period can overlap.

+

Take some time to absorb these cases - it’s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.

+

Writing an SQL query based on the algorithm

+

In theory, we can use any language capable of expressing the algorithmic requirements outlined in the above section. Why choose SQL? It often conveys the logic behind an algorithm elegantly; and while it can be excessively verbose at times, it isn’t in this case.

+

Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).

+

Introducing the DuckDB package

+

Once again, in theory, any SQL package or language can be used. Few, however, match the ease of use that DuckDB provides:

+
    +
  1. no expensive set-up time (meaning no need for setting up databases, even temporary ones),
  2. +
  3. no dependencies (other than DuckDB itself, just pip install duckdb),
  4. +
  5. some very friendly SQL extensions, and
  6. +
  7. ability to work directly on Polars and Pandas DataFrames without conversions
  8. +
+

all with mind-blowing speed that stands shoulder-to-shoulder with Polars. We’ll also use a few advanced SQL concepts noted below.

+

Self-joins

+

This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.

+

A bullet train recap of non-equi joins

+

A key concept that we’ll use is the idea of joining on a range of values rather than a specific value. That is, instead of the usual LEFT JOIN ON A.column = B.column, we can do LEFT JOIN ON A.column <= B.column for one row in table A to match to multiple rows in B. DuckDB has a blog post that outlines this join in detail, including fast implementation.

+

The concept of LIST columns

+

DuckDB has first class support for LIST columns - that is, each row in a LIST column can have a varying length (much like a Python list), but all elements must have the exact same datatype (like R’s vector). Using list columns allows us to eschew an additional GROUP BY operation on top of a WHERE filter or SELECT DISTINCT, since we can perform those operations directly on the LIST column itself.

+

Date algebra

+

Dates can be rather difficult to handle well in most tools and languages, with several packages purpose built to make handling them easier - lubridate from the tidyverse is a stellar example. Thankfully, DuckDB provides a similar Swiss-army-knife set of tools to deal with them, including INTERVALs (a special data type that represents a period of time independent of any specific point in time), which can be added to or subtracted from TIMESTAMP values.

+

Tell me the query, PLEASE!

+

Okay - that was a lot of background. Let’s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN (
+16    SELECT *
+17        ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+18    FROM data) B
+19
+20ON ((B.arrival_time <= A.window_open AND 
+21    	(B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+22    (B.arrival_time >= A.window_open AND 
+23                                  B.departure_time  <= A.window_close) OR
+24    (B.arrival_time >= A.window_open AND
+25    	(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close))
+26GROUP BY 1, 2, 3, 4

A small, succinct query such as this will need a bit of explanation to take it all in. Here’s one below, reproducible in Python (make sure to install duckdb first!). Expand it to view.

+ + + + + + +
 1import duckdb as db
+ 2db.query("""
+ 3    SELECT
+ 4        A.arrival_time
+ 5        ,A.departure_time
+ 6        ,A.window_open
+ 7        ,A.window_close
+ 8        -- LIST aggregates the values into a LIST column
+ 9        -- and LIST_DISTINCT finds the unique values in it
+10        ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11        -- finally, LIST_UNIQUE calculates the unique number of values in it
+12        ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+13
+14    FROM (
+15        SELECT
+16            *
+17            ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+18            ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+19        FROM data -- remember we defined data as the Polars DataFrame with our truck station data
+20    ) A
+21
+22    LEFT JOIN (
+23        SELECT
+24            *
+25            -- This is the time, in seconds between the arrival and departure of
+26            -- each truck PER ROW in the original data-frame 
+27            ,DATEDIFF('seconds', arrival_time, departure_time) AS duration
+28        FROM data -- this is where we perform a self-join
+29    ) B
+30
+31    ON (
+32        -- Case 2 in the diagram;
+33        (B.arrival_time <= A.window_open AND 
+34            -- Adding the duration here makes sure that the second interval
+35            -- is at least ENDING AFTER the start of the overlap window
+36            (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+37
+38        -- Case 3 in the diagram - the simplest of all five cases
+39        (B.arrival_time >= A.window_open AND 
+40                                      B.departure_time  <= A.window_close) OR
+41
+42        -- Case 4 in the diagram;
+43        (B.arrival_time >= A.window_open AND
+44            -- Subtracting the duration here makes sure that the second interval
+45            -- STARTS BEFORE the end of the overlap window.
+46            (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)
+47    )
+48    GROUP BY 1, 2, 3, 4
+49""")
+

The output of this query is:

+ + + + + +
"""
+┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
+│    arrival_time     │   departure_time    │     window_open     │ … │  docked_trucks   │ docked_truck_count │
+│      timestamp      │      timestamp      │      timestamp      │   │    varchar[]     │       uint64       │
+├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
+│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1]             │                  1 │
+│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │                  4 │
+│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │                  4 │
+│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3]     │                  3 │
+│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3]     │                  3 │
+├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
+│ 9 rows                                                                                  6 columns (5 shown) │
+└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+"""

We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also see how seamlessly DuckDB integrates with an existing Pandas or Polars pipeline, with zero conversion cost. In fact, we can convert the result back to a Polars or Pandas dataframe by appending a method call to the closing bracket: db.query(...).pl() and db.query(...).pd() respectively.

+

Can we make the SQL simpler?

+

Now that we’ve understood the logic that goes into the query, let’s try to optimize the algorithm. We have the three conditions:

+ + + + + +
1-- Case 2 in the diagram
+2(B.arrival_time <= A.window_open AND 
+3    (B.arrival_time   + TO_SECONDS(B.duration)) >=  A.window_open) OR
+4-- Case 3 in the diagram
+5(B.arrival_time >= A.window_open AND 
+6                              B.departure_time  <= A.window_close) OR
+7-- Case 4 in the diagram
+8(B.arrival_time >= A.window_open AND
+9    (B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)

What is common between these three conditions? It takes a moment to see, but all three cases boil down to requiring that the other truck’s stay starts before the window closes and ends after the window opens. This can be simplified to just:

+ + + + + +
1B.arrival_time   <= A.window_close AND
+2B.departure_time >= A.window_open

making our query much simpler!

+

Simplified SQL: Part 1

+

We’ve removed the need for the duration calculation altogether now. Therefore, we can write:

+ + + + + +
 1SELECT
+ 2     A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.window_open
+ 5    ,A.window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8
+ 9FROM (
+10    SELECT *
+11        ,arrival_time   - (INTERVAL 1 MINUTE) AS window_open
+12        ,departure_time + (INTERVAL 1 MINUTE) AS window_close
+13    FROM data) A
+14
+15LEFT JOIN data B
+16
+17ON (
+18    B.arrival_time   <= A.window_close AND
+19    B.departure_time >= A.window_open
+20)
+21GROUP BY 1, 2, 3, 4

Can we simplify this even further?

+

Simplification: Part 2

+

I think the SQL query in the above section is already very easy to read. However, it is a little clunky overall, and there is a way we can leverage DuckDB’s extensive optimizations to improve legibility by rewriting the query as a cross join:

+ + + + + +
 1SELECT
+ 2    A.arrival_time
+ 3    ,A.departure_time
+ 4    ,A.arrival_time - (INTERVAL 1 MINUTE)   AS window_open
+ 5    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+ 6    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+ 7    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+ 8FROM  data A, data B
+ 9WHERE B.arrival_time   <= window_close
+10AND   B.departure_time >= window_open
+11GROUP BY 1, 2, 3, 4

Why does this work? Before optimization on DuckDB, this is what the query plan looks like:

+ + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│           FILTER          │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│     (arrival_time <=      │                             
+25│(departure_time + to_m...  │                             
+26│        AS BIGINT))))      │                             
+27│    (departure_time >=     │                             
+28│(arrival_time - to_min...  │                             
+29│        AS BIGINT))))      │                             
+30└─────────────┬─────────────┘                                                          
+31┌─────────────┴─────────────┐                             
+32│       CROSS_PRODUCT       ├──────────────┐              
+33└─────────────┬─────────────┘              │                                           
+34┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+35│         ARROW_SCAN        ││         ARROW_SCAN        │
+36└───────────────────────────┘└───────────────────────────┘ 
+37"""                            
+

After optimization, the CROSS_PRODUCT is automatically optimized to an interval join!

+ + + + + + +
 1"""
+ 2┌───────────────────────────┐                             
+ 3│         PROJECTION        │                             
+ 4│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+ 5│             0             │                             
+ 6│             1             │                             
+ 7│             2             │                             
+ 8│             3             │                             
+ 9│       docked_trucks       │                             
+10│     docked_truck_count    │                             
+11└─────────────┬─────────────┘                                                          
+12┌─────────────┴─────────────┐                             
+13│         AGGREGATE         │                             
+14│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+15│        arrival_time       │                             
+16│       departure_time      │                             
+17│        window_open        │                             
+18│        window_close       │                             
+19│          list(ID)         │                             
+20└─────────────┬─────────────┘                                                          
+21┌─────────────┴─────────────┐                             
+22│      COMPARISON_JOIN      │                             
+23│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
+24│           INNER           │                             
+25│ ((departure_time + '00:01 │                             
+26│     :00'::INTERVAL) >=    ├──────────────┐              
+27│        arrival_time)      │              │              
+28│((arrival_time - '00:01:00'│              │              
+29│       ::INTERVAL) <=      │              │              
+30│       departure_time)     │              │              
+31└─────────────┬─────────────┘              │                                           
+32┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+33│         ARROW_SCAN        ││         ARROW_SCAN        │
+34└───────────────────────────┘└───────────────────────────┘
+35"""                      
+

So in effect, we’re exploiting a feature of DuckDB that lets us write our queries in a suboptimal but more readable manner, letting the optimizer do a good chunk of the work for us. I wouldn’t recommend relying on this in general, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.

+

How to get query plans?

+

I’m glad you asked. Here’s the DuckDB page explaining EXPLAIN (heh). Here’s the code I used:

+ + + + + +
 1import duckdb as db
+ 2db.sql("SET EXPLAIN_OUTPUT='all';")
+ 3print(db.query("""
+ 4EXPLAIN
+ 5SELECT
+ 6    A.arrival_time
+ 7    ,A.departure_time
+ 8    ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
+ 9    ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
+10    ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
+11    ,LIST_UNIQUE(LIST(B.ID))   AS docked_truck_count
+12FROM  data A, data B
+13WHERE B.arrival_time   <= window_close
+14AND   B.departure_time >= window_open
+15GROUP BY 1, 2, 3, 4
+16""").pl()[1, 1])

What are the alternatives?

+

The data.table way

+

data.table is a package that has historically been ahead of its time, in both speed and features. Development has taken a hit recently, but will likely pick back up. It’s my favourite package on all fronts for data manipulation, but it suffers simply from the lack of broader R support across the ML and DL space.

+

The foverlaps function

+

If this kind of overlapping join is common, shouldn’t someone have developed a package for it? Turns out, data.table has, and with very specific constraints that make it the perfect solution to our problem (if you don’t mind switching over to R, that is).

+

The foverlaps function has these requirements:

+
    +
  1. The input data.table objects have to be keyed for automatic recognition of columns.
  2. +
  3. The default match type is “any”, which covers all three cases from the image above. Side note: it also supports other match types, such as “within”, “start” and “end” overlaps.
  4. +
  5. The last two columns in the join condition in by must specify the start and end points of the overlapping window. This isn’t a problem for us now, but it does restrict future uses where we may want non-equi joins on other conditions.
  6. +
+

The code, si, the code!

+

Without further ado:

+ + + + + +
 1library(data.table)
+ 2library(lubridate)
+ 3
+ 4######### BOILERPLATE CODE, NO LOGIC HERE ####################
+ 5arrival_time = as_datetime(c(
+ 6  '2023-01-01 06:23:47.000000', '2023-01-01 06:26:42.000000',
+ 7  '2023-01-01 06:30:20.000000', '2023-01-01 06:32:06.000000',
+ 8  '2023-01-01 06:33:09.000000', '2023-01-01 06:34:08.000000',
+ 9  '2023-01-01 06:36:40.000000', '2023-01-01 06:37:43.000000',
+10  '2023-01-01 06:39:48.000000'))
+11departure_time = as_datetime(c(
+12  '2023-01-01 06:25:08.000000', '2023-01-01 06:28:02.000000',
+13  '2023-01-01 06:35:01.000000', '2023-01-01 06:33:48.000000',
+14  '2023-01-01 06:36:01.000000', '2023-01-01 06:39:49.000000',
+15  '2023-01-01 06:38:34.000000', '2023-01-01 06:40:48.000000',
+16  '2023-01-01 06:46:10.000000'))
+17ID = c('A1', 'A1', 'A5', 'A6', 'B3', 'C3', 'A6', 'A5', 'A6')
+18
+19DT = data.table(
+20  arrival_time = arrival_time,
+21  departure_time = departure_time,
+22  ID = ID)
+23######### BOILERPLATE CODE, NO LOGIC HERE ####################
+24
+25# A copy(DT) creates a copy of a data.table that isn't linked
+26# to the original one, so that changes in it don't reflect in
+27# the original DT object.
+28# The `:=` allows assignment by reference (i.e. "in place").
+29DT_with_windows = copy(DT)[, `:=`(
+30  window_start   = arrival_time   - minutes(1),
+31  window_end = departure_time + minutes(1))]
+32
+33# This step is only strictly necessary for the second table, but we
+34# key both data.tables to make the foverlaps() call very succinct.
+35setkeyv(DT, c("arrival_time", "departure_time"))
+36setkeyv(DT_with_windows, c("window_start", "window_end"))
+37
+38# The foverlaps function returns a data.table, so we can simply apply
+39# the usual data.table syntax on it!
+40# Since some columns share the same name in both data.tables,
+41# the latter table's columns are prefixed with "i." to avoid conflicts.
+42foverlaps(DT, DT_with_windows)[
+43  , .(docked_trucks = list(unique(i.ID)),
+44      docked_truck_count = uniqueN(i.ID))
+45  , .(arrival_time, departure_time)]

provides us the output:

+ + + + + +
 1          arrival_time      departure_time docked_trucks docked_truck_count
+ 2                <POSc>              <POSc>        <list>              <int>
+ 31: 2023-01-01 06:23:47 2023-01-01 06:25:08            A1                  1
+ 42: 2023-01-01 06:26:42 2023-01-01 06:28:02            A1                  1
+ 53: 2023-01-01 06:30:20 2023-01-01 06:35:01   A5,A6,B3,C3                  4
+ 64: 2023-01-01 06:32:06 2023-01-01 06:33:48   A5,A6,B3,C3                  4
+ 75: 2023-01-01 06:33:09 2023-01-01 06:36:01   A5,A6,B3,C3                  4
+ 86: 2023-01-01 06:34:08 2023-01-01 06:39:49   A5,A6,B3,C3                  4
+ 97: 2023-01-01 06:36:40 2023-01-01 06:38:34   B3,C3,A6,A5                  4
+108: 2023-01-01 06:37:43 2023-01-01 06:40:48      C3,A6,A5                  3
+119: 2023-01-01 06:39:48 2023-01-01 06:46:10      C3,A5,A6                  3

Considerations for using data.table

+

The package offers a wonderful, nearly one-stop solution that doesn’t require you to write the query logic out yourself, but it has a major problem for a lot of users: it requires you to switch your codebase to R, while a lot of your tasks may be in Python or in an SQL pipeline. So, what do you do?

+

Consider the effort of maintaining an additional dependency in your analytics pipeline (i.e. R), and the effort you’ll need to invest to run R from Python, or to run an R script in your pipeline and pull its output back in, and make your call.
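To make the second option concrete, here’s a minimal sketch of shelling out to R from Python and pulling the result back into the pipeline. The script and file names (overlap_join.R, docked_trucks.parquet) are hypothetical, and it assumes the R script ends by writing its foverlaps() result to that Parquet file.

import subprocess
import polars as pl

# Run the R script as a separate process; it is assumed to write its
# foverlaps() result to "docked_trucks.parquet" (hypothetical file name).
subprocess.run(["Rscript", "overlap_join.R"], check=True)

# Pull the R output back into the Python side of the pipeline.
docked = pl.read_parquet("docked_trucks.parquet")
print(docked.head())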

+ +
+

+ +

+ + +


+ + + +
+ + + + + + diff --git a/public/projects/index.html b/public/projects/index.html new file mode 100644 index 0000000..090c574 --- /dev/null +++ b/public/projects/index.html @@ -0,0 +1,19 @@ +projects | Avinash's Blog
+

Avinash's Blog

Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.

Featured projects

  1. BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link.
  2. PowerPointSnap: A mostly feature-complete tool for PowerPoint, written in VBA and filled with a lot of tricks that make it easy to consistently format presentations to impress clients - from my consulting days. See the accompanying blog post.

Other work or contributions

  1. IntelligentReceiptSplitter: A relatively simple predecessor to BorrowChecker that focussed on using an OCR framework followed by an LLM-based parser to read receipts, which could then be split further manually. This combination significantly reduced hallucinations from LLMs, but was still very computationally intensive to run.
  2. r.data.table.funs: A very small set of R functions built on data.table that I found very useful earlier in my career for quickly churning out analyses. It is not ground-breaking, but rather something that anybody with basic R skills can understand, and it saves an immense amount of time.
  3. I wrote several chapters of the Polars Book, which have since been moved to the main Polars repository. Polars was a breath of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like data.table and dplyr dominated), so I was eager to make it better for everybody making the switch.

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/projects/index.xml b/public/projects/index.xml new file mode 100644 index 0000000..1c9c7a6 --- /dev/null +++ b/public/projects/index.xml @@ -0,0 +1,25 @@ + + + + Projects on Avinash's Blog + http://localhost:1313/projects/ + Recent content in Projects on Avinash's Blog + Hugo -- gohugo.io + en-US + © Avinash Mallya + + + projects + http://localhost:1313/projects/projects/ + Mon, 01 Jan 0001 00:00:00 +0000 + http://localhost:1313/projects/projects/ + <h1 id="featured-projects">Featured projects</h1> <ol> <li><a href="https://avimallu.github.io/BorrowChecker/">BorrowChecker</a>: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. <a href="https://github.com/avimallu/BorrowChecker">Repository link</a>.</li> <li><a href="https://github.com/avimallu/PowerPointSnap">PowerPointSnap</a>: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying <a href="https://avimallu.github.io/PowerPointSnap">blog post</a>.</li> </ol> + Featured projects +
    +
  1. BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link.
  2. +
  3. PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post.
  4. +
+]]>
+
+
+
diff --git a/public/projects/projects/index.html b/public/projects/projects/index.html new file mode 100644 index 0000000..18b5f9a --- /dev/null +++ b/public/projects/projects/index.html @@ -0,0 +1,107 @@ + + + + + + + +projects | Avinash's Blog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Avinash's Blog

+ +
+
+ + +

Featured projects

+
    +
  1. BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link.
  2. +
  3. PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post.
  4. +
+ +
+

+ +

+ + + + +
+
+ © Avinash Mallya | Design via Bear Cub. +
+ + + + + diff --git a/public/robots.txt b/public/robots.txt new file mode 100644 index 0000000..ac5dc4c --- /dev/null +++ b/public/robots.txt @@ -0,0 +1,3 @@ +User-agent: * +Allow: / +Sitemap: https://avimallu.dev/sitemap.xml diff --git a/public/sitemap.xml b/public/sitemap.xml new file mode 100644 index 0000000..494b08c --- /dev/null +++ b/public/sitemap.xml @@ -0,0 +1 @@ +https://avimallu.dev/2023-10-20T00:00:00+00:00https://avimallu.dev/blog/2023-10-20T00:00:00+00:00https://avimallu.dev/projects/https://avimallu.dev/tags/powerpoint/2023-10-20T00:00:00+00:00https://avimallu.dev/tags/ppt/2023-10-20T00:00:00+00:00https://avimallu.dev/blog/003_powerpointsnap/2023-10-20T00:00:00+00:00https://avimallu.dev/tags/2023-10-20T00:00:00+00:00https://avimallu.dev/tags/vba/2023-10-20T00:00:00+00:00https://avimallu.dev/tags/approximate/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/category/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/faiss/2023-10-19T00:00:00+00:00https://avimallu.dev/blog/002_representative_samples/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/graph/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/nearest/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/neighbor/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/network/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/networkx/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/polars/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/representative/2023-10-19T00:00:00+00:00https://avimallu.dev/tags/samples/2023-10-19T00:00:00+00:00https://avimallu.dev/blog/001_overlap_joins/2023-06-22T00:00:00+00:00https://avimallu.dev/categories/ \ No newline at end of file diff --git a/public/syntax.min.css b/public/syntax.min.css new file mode 100644 index 0000000..d50e07d --- /dev/null +++ b/public/syntax.min.css @@ -0,0 +1 @@ +.bg{color:#f8f8f2;background-color:#282a36}.chroma{color:#f8f8f2;background-color:#282a36}.chroma .lnlinks{outline:none;text-decoration:none;color:inherit}.chroma .lntd{vertical-align:top;padding:0;margin:0;border:0}.chroma .lntable{border-spacing:0;padding:0;margin:0;border:0}.chroma .hl{background-color:#ffc}.chroma .lnt{white-space:pre;user-select:none;margin-right:.4em;padding:0 .4em;color:#929292}.chroma .ln{white-space:pre;user-select:none;margin-right:.4em;padding:0 .4em;color:#929292}.chroma .line{display:flex}.chroma .k{color:#ff79c6}.chroma .kc{color:#ff79c6}.chroma .kd{color:#8be9fd;font-style:italic}.chroma .kn{color:#ff79c6}.chroma .kp{color:#ff79c6}.chroma .kr{color:#ff79c6}.chroma .kt{color:#8be9fd}.chroma .na{color:#50fa7b}.chroma .nb{color:#8be9fd;font-style:italic}.chroma .nc{color:#50fa7b}.chroma .nf{color:#50fa7b}.chroma .nl{color:#8be9fd;font-style:italic}.chroma .nt{color:#ff79c6}.chroma .nv{color:#8be9fd;font-style:italic}.chroma .vc{color:#8be9fd;font-style:italic}.chroma .vg{color:#8be9fd;font-style:italic}.chroma .vi{color:#8be9fd;font-style:italic}.chroma .s{color:#f1fa8c}.chroma .sa{color:#f1fa8c}.chroma .sb{color:#f1fa8c}.chroma .sc{color:#f1fa8c}.chroma .dl{color:#f1fa8c}.chroma .sd{color:#f1fa8c}.chroma .s2{color:#f1fa8c}.chroma .se{color:#f1fa8c}.chroma .sh{color:#f1fa8c}.chroma .si{color:#f1fa8c}.chroma .sx{color:#f1fa8c}.chroma .sr{color:#f1fa8c}.chroma .s1{color:#f1fa8c}.chroma .ss{color:#f1fa8c}.chroma .m{color:#bd93f9}.chroma .mb{color:#bd93f9}.chroma .mf{color:#bd93f9}.chroma .mh{color:#bd93f9}.chroma .mi{color:#bd93f9}.chroma .il{color:#bd93f9}.chroma .mo{color:#bd93f9}.chroma .o{color:#ff79c6}.chroma .ow{color:#ff79c6}.chroma .c{color:#8491b8}.chroma 
.ch{color:#8491b8}.chroma .cm{color:#8491b8}.chroma .c1{color:#8491b8}.chroma .cs{color:#8491b8}.chroma .cp{color:#ff79c6}.chroma .cpf{color:#ff79c6}.chroma .gd{color:#f55}.chroma .ge{text-decoration:underline}.chroma .gh{font-weight:700}.chroma .gi{color:#50fa7b;font-weight:700}.chroma .go{color:#44475a}.chroma .gu{font-weight:700}.chroma .gl{text-decoration:underline} \ No newline at end of file diff --git a/public/tags/approximate/index.html b/public/tags/approximate/index.html new file mode 100644 index 0000000..b7de361 --- /dev/null +++ b/public/tags/approximate/index.html @@ -0,0 +1,6 @@ +Approximate | Avinash's Blog
+

Avinash's Blog

Filtering for "Approximate"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/approximate/index.xml b/public/tags/approximate/index.xml new file mode 100644 index 0000000..1180bce --- /dev/null +++ b/public/tags/approximate/index.xml @@ -0,0 +1,383 @@ +Approximate on Avinash's Bloghttps://avimallu.dev/tags/approximate/Recent content in Approximate on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
    +
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. +
  3. You need to train a model to classify these data points on either these labels, or on labels derived from the imperfect labels.
  4. +
  5. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
  6. +
+

In a hurry?

+

Here’s what you need to do:

+
    +
  1. Read the premise and see if it fits your problem.
  2. +
  3. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
  4. +
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
    +
  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. +
  3. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  4. +
  5. Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!
  6. +
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten it, there’ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different rows. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.
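As an illustrative sketch of that workaround (my addition, not part of the original approach): a pretrained torchvision ResNet with its classification head replaced by an identity layer yields one 512-dimensional vector per image. The model choice, preprocessing and file name are assumptions.

import torch
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image

# Pretrained ResNet-18 with its final classification layer replaced by an
# identity, so the forward pass returns the 512-dimensional pooled features.
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
model.fc = torch.nn.Identity()
model.eval()

preprocess = T.Compose([
    T.Resize(256), T.CenterCrop(224), T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

with torch.no_grad():
    img = preprocess(Image.open("example.jpg").convert("RGB")).unsqueeze(0)  # hypothetical file
    embedding = model(img).squeeze(0).numpy()  # shape: (512,)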

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output to be without those prefixes.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? It helps that it’ll make the model faster to train - and keep the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.
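For instance, on a GPU machine the encoding call above might look like this; the device string is an assumption, so swap in “cuda”, “mps” or “cpu” to match your hardware.

# Same call as above, but pinned to a GPU device and copied back to the CPU
# before converting the tensor to a numpy array.
title_embeddings = (
    ST.encode(
        data.get_column("product_title").to_list(),
        device="cuda",  # assumption: use "mps" on Apple silicon, "cpu" otherwise
        show_progress_bar=True, convert_to_tensor=True)
    .cpu().numpy())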

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.

+
+

The concept of approximate nearest neighbors

+

Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this was the need to calculate all, or nearly all distances between all data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).

+

There are several algorithms that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is available here.

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Euclidean (L2) distance
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this by the category (or label). So let’s design a function that sends only the necessary data as that for a particular category, and then create the database. We’ll need three pieces of information from this function:

+
    +
  1. The actual faiss database.
  2. +
  3. The actual subset of data that was used to build this index.
  4. +
  5. The label indices with respect to the original data that went into the faiss database.
  6. +
+

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = faiss_data.shape[1]         # Number of dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - they'll always be between 0 and 1.4.
+19    # If using Euclidean or other distance, we'll have to spend some time finding a good range
+20    # where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(faiss_data)        # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method to find the k nearest neighbors for a given index, along with the (approximate) distance to it. Let’s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors, and the distance between them. In network graph parlance, this kind of data is called an edge list, i.e. a list of pairs of nodes that are connected, along with any additional information that specifies a property (in this case distance) of the edge that connects them.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map positions in this category's index back to row indices in the original data
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map the indices back to the original strings
+ 6    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B within a particular minimum threshold, then A will be connected to C through B! Hopefully the small visual below helps.
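Here’s a tiny, self-contained illustration of that transitivity with a toy edge list (my own example, not the blog’s data):

import networkx as nx

# A-B and B-C passed the similarity threshold; A-C did not.
toy_graph = nx.Graph([("A", "B"), ("B", "C")])

# A, B and C still land in the same connected component.
print(list(nx.connected_components(toy_graph)))  # one component containing A, B and C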

+

How a network component is formed.

+

What happens when such a concept is extended to many data points? Not all of them would be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of the rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very Starry Night-esque vibes here. Let’s get to the code.

+ + + + + +
1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for the Cell Phones Accessories category.

+ + + + + +
1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Make sure to tune the following if your results aren’t good enough (an example call follows this list):

+
    +
  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. +
  3. Increase the number of nearest neighbors if you want more matches.
  4. +
+
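For example, a looser configuration (illustrative values, not a recommendation) would be:

# Bigger neighborhood and a relaxed similarity threshold => larger clusters.
clusters = get_cluster_map("Cell Phones Accessories", k=10, min_cosine_distance=0.90)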

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

+ + + + + +
1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

+ + + + + +
 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

+ + + + + +
1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

+ + + + + +
 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map positions in this category's index back to row indices in the original data
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map the indices back to the original strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want to write down an algorithmic way of looking at this approach,

+
    +
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. +
  3. Create an ANN database (based on a package such as faiss) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  4. +
  5. Obtain an edge-list of k (from 5 to 100) nearest neighbors for all (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.
  6. +
  7. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.
  8. +
  9. Map all indices back to their source data points, pick any number of items from each cluster (usually, I end up picking one element from each cluster; see the sketch after this list), and you now have your representative sample!
  10. +
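As a final sketch (mine, not from the original post), here’s what picking one member from each cluster of a single category as its representative sample looks like:

# Build clusters for one category, then keep one title per cluster.
clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
representative_sample = [members[0] for members in clusters.values()]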
+]]>
\ No newline at end of file diff --git a/public/tags/category/index.html b/public/tags/category/index.html new file mode 100644 index 0000000..03a9806 --- /dev/null +++ b/public/tags/category/index.html @@ -0,0 +1,6 @@ +Category | Avinash's Blog
+

Avinash's Blog

Filtering for "Category"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/category/index.xml b/public/tags/category/index.xml new file mode 100644 index 0000000..50a0f9c --- /dev/null +++ b/public/tags/category/index.xml @@ -0,0 +1,383 @@ +Category on Avinash's Bloghttps://avimallu.dev/tags/category/Recent content in Category on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
    +
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. +
  3. You need to train a model to classify these data points on either these labels, or on labels derived from the imperfect labels.
  4. +
  5. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
  6. +
+

In a hurry?

+

Here’s what you need to do:

+
    +
  1. Read the premise and see if it fits your problem.
  2. +
  3. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
  4. +
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
    +
  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. +
  3. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  4. +
  5. Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!
  6. +
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten it, there’ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different rows. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output to be without those prefixes.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? It helps that it’ll make the model faster to train - and keep the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.

+
+

The concept of approximate nearest neighbors

+

Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this was the need to calculate all, or nearly all distances between all data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).

+

There are several algorithms that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is available here.

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Euclidean (L2) distance
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this by the category (or label). So let’s design a function that sends only the necessary data as that for a particular category, and then create the database. We’ll need three pieces of information from this function:

+
    +
  1. The actual faiss database.
  2. +
  3. The actual subset of data that was used to build this index.
  4. +
  5. The label indices with respect to the original data that went into the faiss database.
  6. +
+

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = faiss_data.shape[1]         # Number of dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - they'll always be between 0 and 1.4.
+19    # If using Euclidean or other distance, we'll have to spend some time finding a good range
+20    # where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(faiss_data)        # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method to find the k nearest neighbors for a given index, along with the (approximate) distance to it. Let’s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors, and the distance between them. In network graph parlance, this kind of data is called an edge list, i.e. a list of pairs of nodes that are connected, along with any additional information that specifies a property (in this case distance) of the edge that connects them.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map positions in this category's index back to row indices in the original data
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map the indices back to the original strings
+ 6    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B within a particular minimum threshold, then A will be connected to C through B! Hopefully the small visual below helps.

+

How a network component is formed.

+

What happens when such a concept is extended to many data points? Not all of them would be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of the rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very Starry Night-esque vibes here. Let’s get to the code.

1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phones Accessories.

1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Tune the following if your results aren’t good enough (an example call follows the list):

+
  1. Relax (i.e. lower) the min_cosine_distance value if you want bigger clusters - despite the name, it is a minimum cosine similarity.
  2. Increase the number of nearest neighbors k if you want more matches.
+
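For instance, a looser configuration could look like this (k=10 and 0.90 are purely illustrative values, not a recommendation):

# More neighbors considered per point, lower similarity required per edge
clusters_loose = get_cluster_map("Cell Phones Accessories", k=10, min_cosine_distance=0.90)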

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']
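If you’d rather inspect the biggest components first instead of a random one, something like this works (an illustrative snippet, not from the original code):

print(len(clusters))  # number of connected components found

# The three largest clusters by member count
biggest = sorted(clusters.values(), key=len, reverse=True)[:3]
for members in biggest:
    print(len(members), members[:2])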

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]
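One step the snippets above stop short of is materializing the representative sample itself. Here’s a minimal sketch of that last step, assuming you keep one (arbitrary) title per connected component; representative_sample and labels are my own names, and I recompute the cluster map per label rather than reuse the list above:

labels = data.get_column("category_label").unique().to_list()

# One title per connected component, per category. Points that never met the
# similarity threshold form no component at all, so you may want to sample a
# few of those separately as well.
representative_sample = {
    label: [members[0] for members in get_cluster_map(label, 5, 0.95).values()]
    for label in labels
}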

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map positions in faiss_data back to row_idx values in the original data frame
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map row_idx values back to the original title strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want to write down an algorithmic way of looking at this approach,

+
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden-state output of a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. Create an ANN database (based on a package such as faiss) that allows fast nearest-neighbor searches. Use cosine similarity for an easy threshold-determination step.
  3. Obtain an edge-list of the k (anywhere from 5 to 100) nearest neighbors for all data points in the ANN database (or for a sample of them, in case your dataset is incredibly HUGE).
  4. Apply a minimum threshold on similarity (chosen heuristically), and obtain the connected components of the network graph built from the filtered edge-list you just created.
  5. Map all indices back to their source data points, pick any number of items from each cluster (usually I end up picking one element from each cluster), and you now have your representative sample!
+
+]]>
\ No newline at end of file diff --git a/public/tags/faiss/index.html b/public/tags/faiss/index.html new file mode 100644 index 0000000..3543593 --- /dev/null +++ b/public/tags/faiss/index.html @@ -0,0 +1,6 @@ +Faiss | Avinash's Blog
+

Avinash's Blog

Filtering for "Faiss"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/faiss/index.xml b/public/tags/faiss/index.xml new file mode 100644 index 0000000..d23e437 --- /dev/null +++ b/public/tags/faiss/index.xml @@ -0,0 +1,383 @@ +Faiss on Avinash's Bloghttps://avimallu.dev/tags/faiss/Recent content in Faiss on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

\ No newline at end of file diff --git a/public/tags/graph/index.html b/public/tags/graph/index.html new file mode 100644 index 0000000..91bf31c --- /dev/null +++ b/public/tags/graph/index.html @@ -0,0 +1,6 @@ +Graph | Avinash's Blog
+

Avinash's Blog

Filtering for "Graph"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/graph/index.xml b/public/tags/graph/index.xml new file mode 100644 index 0000000..bb7876a --- /dev/null +++ b/public/tags/graph/index.xml @@ -0,0 +1,383 @@ +Graph on Avinash's Bloghttps://avimallu.dev/tags/graph/Recent content in Graph on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

\ No newline at end of file diff --git a/public/tags/index.html b/public/tags/index.html new file mode 100644 index 0000000..9f0c629 --- /dev/null +++ b/public/tags/index.html @@ -0,0 +1,19 @@ +Tags | Avinash's Blog
+

Avinash's Blog

Filtering for "Tags"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/index.xml b/public/tags/index.xml new file mode 100644 index 0000000..8c2222d --- /dev/null +++ b/public/tags/index.xml @@ -0,0 +1 @@ +Tags on Avinash's Bloghttps://avimallu.dev/tags/Recent content in Tags on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaFri, 20 Oct 2023 00:00:00 +0000Powerpointhttps://avimallu.dev/tags/powerpoint/Fri, 20 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/powerpoint/Ppthttps://avimallu.dev/tags/ppt/Fri, 20 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/ppt/Vbahttps://avimallu.dev/tags/vba/Fri, 20 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/vba/Approximatehttps://avimallu.dev/tags/approximate/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/approximate/Categoryhttps://avimallu.dev/tags/category/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/category/Faisshttps://avimallu.dev/tags/faiss/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/faiss/Graphhttps://avimallu.dev/tags/graph/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/graph/Nearesthttps://avimallu.dev/tags/nearest/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/nearest/Neighborhttps://avimallu.dev/tags/neighbor/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/neighbor/Networkhttps://avimallu.dev/tags/network/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/network/Networkxhttps://avimallu.dev/tags/networkx/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/networkx/Polarshttps://avimallu.dev/tags/polars/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/polars/Representativehttps://avimallu.dev/tags/representative/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/representative/Sampleshttps://avimallu.dev/tags/samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/tags/samples/ \ No newline at end of file diff --git a/public/tags/nearest/index.html b/public/tags/nearest/index.html new file mode 100644 index 0000000..080595d --- /dev/null +++ b/public/tags/nearest/index.html @@ -0,0 +1,6 @@ +Nearest | Avinash's Blog
+

Avinash's Blog

Filtering for "Nearest"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/nearest/index.xml b/public/tags/nearest/index.xml new file mode 100644 index 0000000..ea3f700 --- /dev/null +++ b/public/tags/nearest/index.xml @@ -0,0 +1,383 @@ +Nearest on Avinash's Bloghttps://avimallu.dev/tags/nearest/Recent content in Nearest on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. You need to train a model to classify these data points on either these labels, or on labels derived from the imperfect labels.
  3. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
+

In a hurry?

+

Here’s what you need to do:

+
  1. Read the premise and see if it fits your problem.
  2. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
  1. Allows the model to generalize to all kinds of data points within a category.
  2. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  3. Keeps the training set maintainable - if your training set needs validation or annotation by experts, this keeps your costs low!
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten it, there will be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they sit on different rows. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.
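As a rough illustration of that workaround (my own sketch, not from the original post - the model choice, weights name and input shapes are assumptions), a pretrained torchvision ResNet with its classification head removed yields one vector per image:

import torch
import torchvision

# Pretrained ResNet-18 with the final classification layer removed;
# what remains outputs a 512-dimensional vector per image.
backbone = torchvision.models.resnet18(weights="DEFAULT")
feature_extractor = torch.nn.Sequential(*list(backbone.children())[:-1]).eval()

# Stand-in batch of 4 RGB images; replace with real, properly normalized images.
images = torch.randn(4, 3, 224, 224)

with torch.no_grad():
    features = feature_extractor(images)      # shape (4, 512, 1, 1)
    features = features.flatten(start_dim=1)  # shape (4, 512) - a 2D matrix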

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output shown without those prefixes.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.
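For comparison (my own sketch, not from the post; the column names are assumed to match), the same "keep only categories with exactly 10,000 rows" filter in pandas needs a groupby-transform:

import pandas as pd

pdf = pd.read_csv("archive/shopmania.csv")
# Size of each category, broadcast back to every row of that category.
category_sizes = pdf.groupby("category_ID")["category_ID"].transform("size")
pdf = pdf[category_sizes == 10000]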

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? It makes the model faster to train - and keeps the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.
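If you want to try the classic TF-IDF route mentioned above before reaching for transformers, a minimal scikit-learn sketch (my own illustration, not part of the original pipeline; the parameters are arbitrary) looks like this - the resulting matrix is sparse and much higher-dimensional, but it can feed the same downstream machinery once densified:

from sklearn.feature_extraction.text import TfidfVectorizer

titles = data.get_column("product_title").to_list()
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(titles)  # sparse matrix, shape (n_titles, n_terms)

# faiss expects dense float32 vectors, so you would need something like:
# tfidf_dense = tfidf_matrix.astype("float32").toarray()  # careful: this can be huge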

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.
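Here is a small variation of the call above that picks the device explicitly and always copies the result back to the CPU (a sketch assuming a PyTorch-backed install; adjust to your hardware):

import torch

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"   # Apple Silicon
else:
    device = "cpu"

title_embeddings = (
    ST.encode(
        data.get_column("product_title").to_list(),
        device=device,
        show_progress_bar=True, convert_to_tensor=True)
    .cpu()      # harmless on CPU, required when the tensor lives on a GPU/MPS device
    .numpy())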

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.

+
+

The concept of approximate nearest neighbors

+

Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, pairwise distances between data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that ANN algorithms take shortcuts to give you, if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).
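To make the trade-off concrete, here is a tiny self-contained comparison on made-up data (my own sketch, not from the post): an exact flat index versus an inverted-file index that only scans a few cells per query.

import numpy as np
import faiss

rng = np.random.default_rng(0)
xb = rng.random((10_000, 64), dtype=np.float32)  # "database" vectors
xq = rng.random((5, 64), dtype=np.float32)       # query vectors

# Exact search: every query is compared against every database vector.
exact = faiss.IndexFlatL2(64)
exact.add(xb)
D_exact, I_exact = exact.search(xq, 5)

# Approximate search: vectors are bucketed into 100 cells at train time,
# and only `nprobe` cells are scanned per query - faster, slightly less exact.
quantizer = faiss.IndexFlatL2(64)
approx = faiss.IndexIVFFlat(quantizer, 64, 100)
approx.train(xb)
approx.add(xb)
approx.nprobe = 5
D_approx, I_approx = approx.search(xq, 5)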

+

There are several algorithms that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is available here.

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Euclidean (L2) distance
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this by category (or label). So let’s design a function that passes in only the data for a particular category, and then creates the database. We’ll need three pieces of information from this function:

+
  1. The actual faiss database.
  2. The actual subset of data that was used to build this index.
  3. The label indices with respect to the original data that went into the faiss database.
+

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = faiss_data.shape[1]         # Number of dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - they'll always be between -1 and 1.
+19    # If using Euclidean or other distances, we'd have to spend some time finding a good range
+20    # where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(faiss_data)        # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method to find the k nearest neighbors for a given index, along with the (approximate) distance to each. Let’s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an edge list i.e. a list of pairs of nodes that are connected, along with any additional information that specifies a property (in this case distance) of the edge that connects these nodes.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map positions in this category's faiss index back to row indices in `data`
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map the indices back to the original strings
+ 6    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B within a particular minimum threshold, then A will be connected to C through B! Hopefully the small code sketch and the visual below help.
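Here is the chaining idea as a three-node toy example (the node names A/B/C are placeholders, not data from the post):

import networkx as nx

# A-B and B-C pass the similarity threshold, so they become edges;
# A-C is never added directly.
toy = nx.Graph()
toy.add_edge("A", "B")
toy.add_edge("B", "C")

print(list(nx.connected_components(toy)))  # [{'A', 'B', 'C'}] - A reaches C via B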

+

How a network component is formed.

+

What happens when such a concept is extended to many data points? Not all of them will be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of the rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very Starry Night-esque vibes here. Let’s get to the code.

+ + + + + +
1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phones Accessories.

+ + + + + +
1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Make sure to configure the following if your results aren’t good enough:

+
  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. Increase the number of nearest neighbors (k) if you want more matches - see the example calls just below.
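For instance (the values here are illustrative only):

# Bigger clusters: relax the similarity threshold a little.
looser_clusters = get_cluster_map("Cell Phones Accessories", k=5, min_cosine_distance=0.90)

# More matches per item: search more neighbors before thresholding.
wider_clusters = get_cluster_map("Cell Phones Accessories", k=20, min_cosine_distance=0.95)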
+

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

+ + + + + +
1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

+ + + + + +
 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

+ + + + + +
1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]
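The post stops at the clusters themselves. If you want to turn them into the actual representative sample (say, one title per connected component), a small helper along these lines works - `pick_representatives` is my own hypothetical addition, not code from the post. Note also that titles with no sufficiently similar neighbor never enter the graph, so you may want to add those back separately.

def pick_representatives(cluster_maps, per_cluster=1):
    # cluster_maps: list of {component_id: [titles, ...]} dicts, one per category,
    # i.e. the output of the list comprehension above.
    representatives = []
    for cluster_map in cluster_maps:
        for titles in cluster_map.values():
            representatives.extend(titles[:per_cluster])
    return representatives

representative_sample = pick_representatives(clusters)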

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

+ + + + + +
 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map positions in this category's faiss index back to row indices in `data`
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map the indices back to the original strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want to write down an algorithmic way of looking at this approach, here it is:

+
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. Create an ANN database (based on a package such as faiss) that allows fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  3. Obtain an edge-list of the k (from 5 to 100) nearest neighbors for all data points in the ANN database (or a sample of them, in case your dataset is incredibly HUGE).
  4. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.
  5. Map all indices back to the source data points they correspond to, pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!
+]]>
\ No newline at end of file diff --git a/public/tags/neighbor/index.html b/public/tags/neighbor/index.html new file mode 100644 index 0000000..2b45803 --- /dev/null +++ b/public/tags/neighbor/index.html @@ -0,0 +1,6 @@ +Neighbor | Avinash's Blog
+

Avinash's Blog

Filtering for "Neighbor"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/neighbor/index.xml b/public/tags/neighbor/index.xml new file mode 100644 index 0000000..0f6e3c9 --- /dev/null +++ b/public/tags/neighbor/index.xml @@ -0,0 +1,383 @@ +Neighbor on Avinash's Bloghttps://avimallu.dev/tags/neighbor/Recent content in Neighbor on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

+]]>
\ No newline at end of file diff --git a/public/tags/network/index.html b/public/tags/network/index.html new file mode 100644 index 0000000..16b5731 --- /dev/null +++ b/public/tags/network/index.html @@ -0,0 +1,6 @@ +Network | Avinash's Blog
+

Avinash's Blog

Filtering for "Network"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/network/index.xml b/public/tags/network/index.xml new file mode 100644 index 0000000..b336cb2 --- /dev/null +++ b/public/tags/network/index.xml @@ -0,0 +1,383 @@ +Network on Avinash's Bloghttps://avimallu.dev/tags/network/Recent content in Network on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
    +
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. +
  3. You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.
  4. +
  5. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
  6. +
+

In a hurry?

+

Here’s what you need to do:

+
    +
  1. Read the premise and see if it fits your problem.
  2. +
  3. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
  4. +
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
    +
  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. +
  3. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  4. +
  5. Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!
  6. +
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, they’ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output to be without those prefixes.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.
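If you prefer to stay in Pandas, a roughly equivalent filter is sketched below (it assumes the CSV has a header row with these column names; adjust if yours differs):

import pandas as pd

pdf = pd.read_csv("archive/shopmania.csv")
# Keep only rows whose category_ID occurs exactly 10,000 times.
counts = pdf.groupby("category_ID")["category_ID"].transform("size")
pdf = pdf[counts == 10000]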

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? A small sample makes the model faster to train - and keeps the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.
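If a neural model feels like overkill for your data, a plain TF-IDF matrix can stand in as the 2D representation; here is a minimal sketch with scikit-learn (the max_features cap is an arbitrary illustrative choice, and faiss will want a dense float32 array):

from sklearn.feature_extraction.text import TfidfVectorizer

titles = data.get_column("product_title").to_list()
vectorizer = TfidfVectorizer(max_features=2048)
# One row per title, one column per term, as a dense float32 matrix.
title_matrix = vectorizer.fit_transform(titles).toarray().astype("float32")

For very large vocabularies you would keep this sparse and pick an index that supports it, but the dense version keeps the rest of this pipeline unchanged.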

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.
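As a concrete example, a CPU-friendly variant of the encoding call might look like this sketch (the batch_size value is just a starting point to tune; convert_to_numpy=True returns a single numpy array directly, so no .numpy() call is needed):

import sentence_transformers

ST = sentence_transformers.SentenceTransformer("all-MiniLM-L6-v2")
title_embeddings = ST.encode(
    data.get_column("product_title").to_list(),
    batch_size=64,            # tune based on available memory
    device="cpu",
    show_progress_bar=True,
    convert_to_numpy=True,    # returns one float32 numpy array
)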

+
+

The concept of approximate nearest neighbors

+

Running any kind of nearest neighbor algorithm on medium scale datasets (even bordering on 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, pairwise distances between data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that ANN algorithms take shortcuts to give you, if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).

+

There are several algorithms that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is available here.
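One nuance: the IndexFlatL2/IndexFlatIP indexes used below actually perform exact (brute-force) search - fast enough at this scale, but not approximate. If your data grows to the point where that matters, faiss also ships genuinely approximate indexes; a minimal sketch with HNSW (the dimensionality and the 32 links-per-node value are illustrative assumptions):

import faiss
import numpy as np

d = 768                                  # e.g. the all-mpnet-base-v2 embedding size
vectors = np.random.rand(10_000, d).astype("float32")

ann_index = faiss.IndexHNSWFlat(d, 32)   # 32 = links per node in the HNSW graph
ann_index.add(vectors)                   # build the graph
distances, neighbors = ann_index.search(vectors[:5], 5)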

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Euclidean distance
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"
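As a quick sanity check, you could build the index and query a few titles against it (a sketch; the exact numbers you get back depend on your embeddings):

ann_index = create_index(title_embeddings)
# k=5 neighbors for the first three titles.
distances, neighbors = ann_index.search(title_embeddings[:3], 5)
print(neighbors)   # row positions of the nearest titles in title_embeddings
print(distances)   # squared L2 distances to those neighbors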

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this by category (or label). So let’s design a function that selects only the data for a particular category and then creates the database from it. We’ll need three pieces of information from this function:

+
  1. The actual faiss database.
  2. The actual subset of data that was used to build this index.
  3. The label indices with respect to the original data that went into the faiss database.
+

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = faiss_data.shape[1]         # Number of dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - the scores will always be between -1 and 1.
+19    # If using Euclidean or other distances, we'd have to spend some time finding a good range
+20    # where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(faiss_data)        # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method to find the k nearest neighbors for a given index, along with the (approximate) distance to it. Let’s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors, and the distance between them. In network graph parlance, this kind of data is called an edge list, i.e. a list of pairs of nodes that are connected, along with any additional information that specifies a property (in this case distance) of the edge that connects these nodes.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map positional faiss indices back to row indices in the original data
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map row indices back to the original title strings
+ 6    title_name_map = {i: x for i,x in data.with_row_count("row_idx").select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   
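To get a feel for the output, you could inspect the edge-list for one category (a sketch using the function defined just above; "Belts" is one of the 17 labels in this dataset):

edges = get_edge_list("Belts", k=5)
print(edges.columns)   # ['from', 'to', 'distance']
print(edges.head())    # each row: a title, one of its neighbors, and their similarity score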

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B (within a particular minimum threshold), then A will be connected to C through B! Hopefully the small visual below helps.

+

How a network component is formed.
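To make the A-B-C transitivity concrete, here is a tiny self-contained example (toy node names, unrelated to the product data):

import networkx as nx

graph = nx.Graph()
graph.add_edge("A", "B")   # A and B passed the similarity threshold
graph.add_edge("B", "C")   # so did B and C; A and C never matched directly

components = list(nx.connected_components(graph))
print(components)          # [{'A', 'B', 'C'}] - A and C land in the same component via B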

+

What happens when such a concept is extended to many data points? Not all of them will be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of the otherwise rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very Starry Night-esque vibes here. Let’s get to the code.

+ + + + + +
1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phones Accessories.

+ + + + + +
1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Tweak the following if your results aren’t good enough:

+
  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. Increase the number of nearest neighbors if you want more matches.
+

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

+ + + + + +
1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

+ + + + + +
 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

+ + + + + +
1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]
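The post stops at the clusters themselves; to turn them into an actual representative sample, one simple policy (a sketch of one option, not the only one) is to keep a single title per cluster:

# `clusters` is the list of per-category dictionaries built above.
representative_sample = []
for category_clusters in clusters:
    for cluster_id, titles in category_clusters.items():
        representative_sample.append(titles[0])   # keep one member per cluster

print(len(representative_sample))

Titles that never met the similarity threshold won’t appear in any cluster, so you may want to top the sample up with a small random draw of those leftovers as well.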

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

+ + + + + +
 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map positional faiss indices back to row indices in the original data
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map the indices back to the original strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want an algorithmic way of looking at this approach:

+
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. Create an ANN database (based on a package such as faiss) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  3. Obtain an edge-list of the k (from 5 to 100) nearest neighbors for all data points in the ANN database (or a sample of them if your dataset is incredibly HUGE).
  4. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.
  5. Map all indices back to their source data points, pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!
+]]>
\ No newline at end of file diff --git a/public/tags/networkx/index.html b/public/tags/networkx/index.html new file mode 100644 index 0000000..bd47372 --- /dev/null +++ b/public/tags/networkx/index.html @@ -0,0 +1,6 @@ +Networkx | Avinash's Blog
+

Avinash's Blog

Filtering for "Networkx"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/networkx/index.xml b/public/tags/networkx/index.xml new file mode 100644 index 0000000..4c595dc --- /dev/null +++ b/public/tags/networkx/index.xml @@ -0,0 +1,383 @@ +Networkx on Avinash's Bloghttps://avimallu.dev/tags/networkx/Recent content in Networkx on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it can be hard to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
    +
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. +
  3. You need to train a model to classify these data points on either these labels, or on labels derived from the imperfect labels.
  4. +
  5. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
  6. +
+

In a hurry?

+

Here’s what you need to do:

+
    +
  1. Read the premise and see if it fits your problem.
  2. +
  3. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
  4. +
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
    +
  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. +
  3. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  4. +
  5. Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!
  6. +
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) can be difficult because even if you flatten the images, there will be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they fall on different rows. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output to be without those prefixes.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? It helps that it’ll make the model faster to train - and keep the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.

+
+

The concept of approximate nearest neighbors

+

Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this was the need to calculate all, or nearly all distances between all data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).

+

There are several algorithms that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones are available here.

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Euclidean distance
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this by the category (or label). So let’s design a function that sends only the necessary data as that for a particular category, and then create the database. We’ll need three pieces of information from this function:

+
    +
  1. The actual faiss database.
  2. +
  3. The actual subset of data that was used to build this index.
  4. +
  5. The label indices with respect to the original data that went into the faiss database.
  6. +
+

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = faiss_data.shape[1]         # Number of dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - the scores will always be between -1 and 1.
+19    # If using Euclidean or other distances, we'd have to spend some time finding a good range
+20    # where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(faiss_data)        # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method to find the k nearest neighbors for a given index, along with the (approximate) distance to it. Let’s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors, and the distance between them. In network graph parlance, this kind of data is called an edge list, i.e. a list of pairs of nodes that are connected, along with any additional information that specifies a property (in this case distance) of the edge that connects these nodes.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map the data back to the original `train[b'data']` array
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map the indices back to the original strings
+ 6    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B (within a particular minimum threshold), then A will be connected to C through B! Hopefully the small visual below helps.

+

How a network component is formed.

+

What happens when such a concept is extended to many data points? Not all of them will be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of the otherwise rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very Starry Night-esque vibes here. Let’s get to the code.

+ + + + + +
1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phone Accessories.

+ + + + + +
1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Make sure to configure the following if your results aren’t good enough:

+
    +
  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. +
  3. Increase the number of nearest neighbors if you want more matches.
  4. +
+

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

+ + + + + +
1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

+ + + + + +
 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

+ + + + + +
1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

+ + + + + +
 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map the data back to the original `train[b'data']` array
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map the indices back to the original strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want to write down an algorithmic way of looking at this approach,

+
    +
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. +
  3. Create an ANN database (based on a package such as faiss) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  4. +
  5. Obtain an edge-list of k (from 5 to 100) nearest neighbors for all (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.
  6. +
  7. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.
  8. +
  9. Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!
  10. +
+]]>
\ No newline at end of file diff --git a/public/tags/polars/index.html b/public/tags/polars/index.html new file mode 100644 index 0000000..a1f4c24 --- /dev/null +++ b/public/tags/polars/index.html @@ -0,0 +1,6 @@ +Polars | Avinash's Blog
+

Avinash's Blog

Filtering for "Polars"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/polars/index.xml b/public/tags/polars/index.xml new file mode 100644 index 0000000..56f37a9 --- /dev/null +++ b/public/tags/polars/index.xml @@ -0,0 +1,383 @@ +Polars on Avinash's Bloghttps://avimallu.dev/tags/polars/Recent content in Polars on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it can be hard to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
    +
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. +
  3. You need to train a model to classify these data points on either these labels, or on labels derived from the imperfect labels.
  4. +
  5. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
  6. +
+

In a hurry?

+

Here’s what you need to do:

+
    +
  1. Read the premise and see if it fits your problem.
  2. +
  3. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
  4. +
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
    +
  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. +
  3. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  4. +
  5. Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!
  6. +
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) can be difficult because even if you flatten the images, there will be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they fall on different rows. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output to be without those prefixes.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? It helps that it’ll make the model faster to train - and keep the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.

+
+

The concept of approximate nearest neighbors

+

Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this was the need to calculate all, or nearly all distances between all data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).

+

There are several algorithms that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones are available here.

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Euclidean distance
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this by the category (or label). So let’s design a function that sends only the necessary data as that for a particular category, and then create the database. We’ll need three pieces of information from this function:

+
    +
  1. The actual faiss database.
  2. +
  3. The actual subset of data that was used to build this index.
  4. +
  5. The label indices with respect to the original data that went into the faiss database.
  6. +
+

The data subset and the label indices will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = faiss_data.shape[1]         # Number of dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - the scores are cosine similarities,
+19    # always bounded between -1 and 1. With Euclidean or another unbounded distance, we'd have to spend
+20    # time finding a good range where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(faiss_data)        # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method that finds the k nearest neighbors for a given vector, along with the (approximate) distance to each of them. Let’s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors, and the distance between them. In network graph parlance, this kind of data is called an edge list, i.e. a list of pairs of nodes that are connected, along with any additional information that specifies a property (in this case distance) of the edge that connects these nodes.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map positions in the per-category subset back to row indices in the original `data` frame
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map the indices back to the original strings
+ 6    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B (within a particular minimum threshold), then A will be connected to C through B! Hopefully the small visual below helps.

+

How a network component is formed.
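To make that transitivity concrete in code, here’s a toy sketch (with made-up edges, not our product data): A-B and B-C pass the threshold while A-C doesn’t, yet all three land in the same connected component.

import networkx as nx

graph = nx.Graph()
# Hypothetical edges that survived the similarity threshold.
graph.add_edges_from([("A", "B"), ("B", "C"), ("D", "E")])
print(list(nx.connected_components(graph)))
# [{'A', 'B', 'C'}, {'D', 'E'}] -> A reaches C through B; D and E form their own cluster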

+

What happens when such a concept is extended to many data points? Not all of them will be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of this otherwise rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very Starry Night-esque vibes here. Let’s get to the code.

+ + + + + +
1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phones Accessories.

+ + + + + +
1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Make sure to tune the following if your results aren’t good enough (a quick parameter-sweep sketch follows this list):

+
    +
  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. +
  3. Increase the number of nearest neighbors if you want more matches.
  4. +
+
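If you’d rather eyeball these two knobs than guess, here’s a small sketch (assuming `data`, `title_embeddings` and the functions defined above are already in scope) that sweeps both parameters for one category and prints how the cluster count and the largest cluster react:

for k in (5, 10):
    for threshold in (0.90, 0.95, 0.98):
        clusters = get_cluster_map("Cell Phones Accessories", k=k, min_cosine_distance=threshold)
        sizes = [len(members) for members in clusters.values()]
        print(f"k={k:>2}  threshold={threshold:.2f}  "
              f"clusters={len(clusters):>5}  largest={max(sizes, default=0):>5}")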

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

+ + + + + +
1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

+ + + + + +
 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

+ + + + + +
1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

+ + + + + +
 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map positions in the per-category subset back to row indices in the original `data` frame
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map the indices back to the original strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want to write down an algorithmic way of looking at this approach,

+
    +
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. +
  3. Create an ANN database (based on a package such as faiss) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  4. +
  5. Obtain an edge-list of k (from 5 to 100) nearest neighbors for all (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.
  6. +
  7. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.
  8. +
  9. Map all indices back to their source data points, and pick any number of items from each cluster (usually, I end up picking one element from each cluster) - you now have your representative sample! A small sketch of this final step follows the list.
  10. +
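As a closing illustration of that last step, here’s a minimal sketch (assuming `clusters` holds the list of per-category dictionaries produced by the comprehension above) that keeps one title per connected component; titles that never joined a cluster have no near-duplicates and can simply be added back as-is if you want full coverage.

representative_titles = [
    members[0]                      # any member works - the cluster is near-duplicates
    for per_category in clusters    # one dict of clusters per category
    for members in per_category.values()
]
representative_sample = data.filter(
    pl.col("product_title").is_in(representative_titles)
)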
+]]>
\ No newline at end of file diff --git a/public/tags/powerpoint/index.html b/public/tags/powerpoint/index.html new file mode 100644 index 0000000..4a5fa8a --- /dev/null +++ b/public/tags/powerpoint/index.html @@ -0,0 +1,6 @@ +Powerpoint | Avinash's Blog
+

Avinash's Blog

Filtering for "Powerpoint"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/powerpoint/index.xml b/public/tags/powerpoint/index.xml new file mode 100644 index 0000000..c60b4bb --- /dev/null +++ b/public/tags/powerpoint/index.xml @@ -0,0 +1,101 @@ +Powerpoint on Avinash's Bloghttps://avimallu.dev/tags/powerpoint/Recent content in Powerpoint on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaFri, 20 Oct 2023 00:00:00 +0000Quick hacks to make client-ready presentationshttps://avimallu.dev/blog/003_powerpointsnap/Fri, 20 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/003_powerpointsnap/<h1 id="premise">Premise</h1> +<p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (<em>decks</em> in consulting lingo - not even <em>slide decks</em>). However, it was rather repetitive. Thus, was born PowerPointSnap.</p> +<h1 id="what-is-it">What is it?</h1> +<p>I&rsquo;ll write this down as pointers.</p> +<ol> +<li>It&rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.</li> +<li>It&rsquo;s Windows only - it&rsquo;s unlikely to work on MacOS.</li> +<li>It&rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.</li> +</ol> +<h1 id="how-do-i-get-it">How do I get it?</h1> +<p>The project is available on this <a href="https://github.com/avimallu/PowerPointSnap">Github repo</a>. The instructions to install it are available there, but here&rsquo;s the down-low:</p>Premise +

When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (decks in consulting lingo - not even slide decks). However, it was rather repetitive. Thus, was born PowerPointSnap.

+

What is it?

+

I’ll write this down as pointers.

+
    +
  1. It’s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.
  2. +
  3. It’s Windows only - it’s unlikely to work on MacOS.
  4. +
  5. It’s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.
  6. +
+

How do I get it?

+

The project is available on this Github repo. The instructions to install it are available there, but here’s the down-low:

+
    +
  1. Download the Snap.ppam file to your system.
  2. +
  3. Enable the developer options.
  4. +
  5. Go to the Developer tab, and click on PowerPoint Add-ins.
  6. +
  7. Click on Add New. Choose the location of the file you just downloaded. Click Close.
  8. +
  9. To uninstall, repeat the process, and simply click on Remove this time.
  10. +
+

What can I do with it?

+

Frankly, a LOT. The base concept of this tool is:

+
    +
  1. “Set” a shape as the one you want to copy a property from.
  2. +
  3. Select any property from the list to automatically apply it.
  4. +
+

Here’s a non-exhaustive list of all the options available.

+

Apply properties of shapes directly

+

This is the part of the interface that can be used for shapes (which include charts and tables).

+

The UI for copying shape properties

+

To use, first select a shape object and click on “Set”. Then choose the object you want to Snap those properties onto (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.

+

Note that it’s probably not worth applying a property of a shape to a table - if you want to make the entire table orange, there are probably better built-in ways to do it than to use Snap.

+

Beautify charts with Snappable properties

+

Charts are also supported, with dedicated features for it.

+

The UI for copying chart properties

+

What do these features do? You should be able to hover over the option and get a tooltip that shows what it’s capable of, but here’s another summary just in case:

+
    +
  1. Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the “set” chart to the one you’ve selected. I couldn’t put in just $x$ and $y$ here because Microsoft internally doesn’t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn’t work well yet for 3D charts.
  2. +
  3. Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all look exactly the same from a client perspective. But that’s usually difficult if you’ve already configured the charts a little - which can be remedied with this option!
  4. +
  5. Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you’ve selected with the way it originally is in the “set” chart. The reason for this feature is simply to avoid going back to Home to click on the Format Painter option again.
  6. +
  7. Reset Axes Scales: in case you messed up somewhere, you can use this to revert to PowerPoint defaults.
  8. +
+

The next two options deserve their own section.

+

Customize the labels programmatically

+

Your immediate senior in a consulting environment would frown at your chart, and then exclaim, “I think that’s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it’s a one time thing!”

+

It’s never a one time affair. But don’t worry, we have this nice feature to help us. If you click on the Customize Label option, you will get this (without the “Set” option):

+

The UI for customizing labels.

+

Never mind the rather unfriendly legend entries. They’re just here to demonstrate the kinds of wacky things you can do with your own chart!

+

Screenshots of the chart snapability

+

Of course, visuals will do it more justice. For example, look at this image:

+

There’s a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren’t centered.

+

Here’s what you can do:

+
    +
  1. Click on the left chart. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right chart, and then go through the following: +
      +
    1. In Shapes, click on Dim. This will align the shapes of the chart.
    2. +
    3. Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.
    4. +
    5. You’ll notice that the chart area still doesn’t match, nor does the title.
    6. +
    7. In Charts, click on Sync Plot Area and Sync Title Area, and watch the magic unfold.
    8. +
    9. Now, click on the second chart, and click on “Set”. Let’s align the axes of the first chart to the second one.
    10. +
    11. Click on the first chart, and then in Charts, click Sync Value Axis.
    12. +
    +
  4. +
  5. Let’s bring that senior’s exclamation back into play - (s)he wants you to highlight only Profit labels, and that too every 2 iterations. To do this: +
      +
    1. Click on Customize Labels after clicking on either chart.
    2. +
    3. You’ll get the screen shown in the previous section. Make sure to adjust the values such that it’s exactly like the screenshot there.
    4. +
    5. Click on “Save and Run”. This will save the configuration you’ve selected, and run it on the chart you’ve selected.
    6. +
    7. Click the other chart. Then, in Charts, click on Rerun Customization.
    8. +
    +
  6. +
+

This is what your results should look like:

+

Everything almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it so quickly… maybe they should change some requirements…

+

Of course, getting those calculations right is a whole different thing that will need some work.

+

Align table dimensions

+

Oftentimes, you have two tables that show similar values… you know the drill. Here’s what you can do in a scenario such as this:

+

Similar data, but vastly different tables.

+

This is what the Tables section of the tool looks like:

+

The UI for Tables

+

To align these tables together,

+
    +
  1. Click on the left table. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right table.
  4. +
  5. Click on Shapes, inside it, Dim. Now the shapes of the table are the same.
  6. +
  7. In Tables, click on Sync Column Widths. Now the columns are also the same.
  8. +
  9. If you try to align by rows, it fails because the number of rows is not the same in the two tables.
  10. +
+

Here’s what you’ll end up with:

+

Similar data, and similar enough tables.

+

Pretty neat, eh?

+]]>
\ No newline at end of file diff --git a/public/tags/ppt/index.html b/public/tags/ppt/index.html new file mode 100644 index 0000000..0d21300 --- /dev/null +++ b/public/tags/ppt/index.html @@ -0,0 +1,6 @@ +Ppt | Avinash's Blog
+

Avinash's Blog

Filtering for "Ppt"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/ppt/index.xml b/public/tags/ppt/index.xml new file mode 100644 index 0000000..91fbf20 --- /dev/null +++ b/public/tags/ppt/index.xml @@ -0,0 +1,101 @@ +Ppt on Avinash's Bloghttps://avimallu.dev/tags/ppt/Recent content in Ppt on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaFri, 20 Oct 2023 00:00:00 +0000Quick hacks to make client-ready presentationshttps://avimallu.dev/blog/003_powerpointsnap/Fri, 20 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/003_powerpointsnap/<h1 id="premise">Premise</h1> +<p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (<em>decks</em> in consulting lingo - not even <em>slide decks</em>). However, it was rather repetitive. Thus, was born PowerPointSnap.</p> +<h1 id="what-is-it">What is it?</h1> +<p>I&rsquo;ll write this down as pointers.</p> +<ol> +<li>It&rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.</li> +<li>It&rsquo;s Windows only - it&rsquo;s unlikely to work on MacOS.</li> +<li>It&rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.</li> +</ol> +<h1 id="how-do-i-get-it">How do I get it?</h1> +<p>The project is available on this <a href="https://github.com/avimallu/PowerPointSnap">Github repo</a>. The instructions to install it are available there, but here&rsquo;s the down-low:</p>Premise +

When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (decks in consulting lingo - not even slide decks). However, it was rather repetitive. Thus, was born PowerPointSnap.

+

What is it?

+

I’ll write this down as pointers.

+
    +
  1. It’s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.
  2. +
  3. It’s Windows only - it’s unlikely to work on MacOS.
  4. +
  5. It’s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.
  6. +
+

How do I get it?

+

The project is available on this Github repo. The instructions to install it are available there, but here’s the down-low:

+
    +
  1. Download the Snap.ppam file to your system.
  2. +
  3. Enable the developer options.
  4. +
  5. Go to the Developer tab, and click on PowerPoint Add-ins.
  6. +
  7. Click on Add New. Choose the location of the file you just downloaded. Click Close.
  8. +
  9. To uninstall, repeat the process, and simply click on Remove this time.
  10. +
+

What can I do with it?

+

Frankly, a LOT. The base concept of this tool is:

+
    +
  1. “Set” a shape as the one you want to copy a property from.
  2. +
  3. Select any property from the list to automatically apply it.
  4. +
+

Here’s a non-exhaustive list of all the options available.

+

Apply properties of shapes directly

+

This is the part of the interface that can be used for shapes (which include charts and tables).

+

The UI for copying shape properties

+

To use, first select a shape object, click on “Set”. Then, choose the object you want to Snap its properties to (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.

+

Note that it’s probably not worth applying a property of a shape to a table - if you want to make the entire table orange, there are probably better built-in ways to do it than to use Snap.

+

Beautify charts with Snappable properties

+

Charts are also supported, with dedicated features for it.

+

The UI for copying chart properties

+

What do these features do? You should be able to hover over the option and get a tooltip that shows what it’s capable of, but here’s another summary just in case:

+
    +
  1. Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the “set” chart to the one you’ve selected. I couldn’t put in just $x$ and $y$ here because Microsoft internally doesn’t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn’t work well yet for 3D charts.
  2. +
  3. Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all look exactly the same from a client perspective. But that’s usually difficult if you’ve already configured the charts a little - which can be remedied with this option!
  4. +
  5. Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you’ve selected with the way it originally is in the “set” chart. The reason for this feature is simply to avoid going back to Home to click on the Format Painter option again.
  6. +
  7. Reset Axes Scales: in case you messed up somewhere, you can use this to revert to PowerPoint defaults.
  8. +
+

The next two options deserve their own section.

+

Customize the labels programmatically

+

Your immediate senior in a consulting environment would frown at your chart, and then exclaim, “I think that’s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it’s a one time thing!”

+

It’s never a one time affair. But don’t worry, we have this nice feature to help us. If you click on the Customize Label option, you will get this (without the “Set” option):

+

The UI for customizing labels.

+

Never mind the rather unfriendly legend entries. They’re just here to demonstrate the kinds of wacky things you can do with your own chart!

+

Screenshots of the chart snapability

+

Of course, visuals will do it more justice. For example, look at this image:

+

There’s a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren’t centered.

+

Here’s what you can do:

+
    +
  1. Click on the left chart. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right chart, and then go through the following: +
      +
    1. In Shapes, click on Dim. This will align the shapes of the chart.
    2. +
    3. Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.
    4. +
    5. You’ll notice that the chart area doesn’t still match, nor does the title.
    6. +
    7. In Charts, click on Sync Plot Area and Sync Title Area, and watch the magic unfold.
    8. +
    9. Now, click on the second chart, and click on “Set”. Let’s align the axes of the first chart to the second one.
    10. +
    11. Click on the first chart, and then in Charts, click Sync Value Axis.
    12. +
    +
  4. +
  5. Let’s bring that senior’s exclamation back into play - (s)he wants you to highlight only Profit labels, and that too every 2 iterations. To do this: +
      +
    1. Click on Customize Labels after clicking on either chart.
    2. +
    3. You’ll get the screen shown in the previous section. Make sure to adjust the values such that it’s exactly like the screenshot there.
    4. +
    5. Click on “Save and Run”. This will save the configuration you’ve selected, and run it on the chart you’ve selected.
    6. +
    7. Click the other chart. Then, in Charts, click on Rerun Customization.
    8. +
    +
  6. +
+

This is what your results should look like:

+

Everything almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it so quickly… maybe they should change some requirements…

+

Of course, getting those calculations right is a whole different thing that will need some work.

+

Align table dimensions

+

Oftentimes, you have two tables that show similar values… you know the drill. Here’s what you can do in a scenario such as this:

+

Similar data, but vastly different tables.

+

This is what the Tables section of the tool looks like:

+

The UI for Tables

+

To align these tables together,

+
    +
  1. Click on the left table. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right table.
  4. +
  5. Click on Shapes, inside it, Dim. Now the shapes of the table are the same.
  6. +
  7. In Tables, click on Sync Column Widths. Now the columns are also the same.
  8. +
  9. If you try to align by rows, it fails because the number of rows are not the same in the two tables.
  10. +
+

Here’s what you’ll end up with:

+

Similar data, and similar enough tables.

+

Pretty neat, eh?

+]]>
\ No newline at end of file diff --git a/public/tags/representative/index.html b/public/tags/representative/index.html new file mode 100644 index 0000000..964cfb7 --- /dev/null +++ b/public/tags/representative/index.html @@ -0,0 +1,6 @@ +Representative | Avinash's Blog
+

Avinash's Blog

Filtering for "Representative"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/representative/index.xml b/public/tags/representative/index.xml new file mode 100644 index 0000000..213d14c --- /dev/null +++ b/public/tags/representative/index.xml @@ -0,0 +1,383 @@ +Representative on Avinash's Bloghttps://avimallu.dev/tags/representative/Recent content in Representative on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
    +
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. +
  3. You need to train a model to classify these data points on either these labels, or on labels derived from the imperfect labels.
  4. +
  5. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
  6. +
+

In a hurry?

+

Here’s what you need to do:

+
    +
  1. Read the premise and see if it fits your problem.
  2. +
  3. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
  4. +
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
    +
  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. +
  3. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  4. +
  5. Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!
  6. +
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there will be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.
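For the image case, a hedged sketch of that workaround (using torchvision’s pretrained ResNet-18 purely as an example backbone - this is not part of the original recipe) might look like this; the classification head is dropped so each image comes out as a single 512-dimensional row:

import torch
import torchvision.models as models

backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
backbone.fc = torch.nn.Identity()   # drop the classifier; keep the pooled 512-d features
backbone.eval()

with torch.no_grad():
    images = torch.rand(8, 3, 224, 224)  # stand-in batch of 8 RGB images
    features = backbone(images)          # shape: (8, 512) -> one row per image, ready for the 2D matrix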

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used, and the output to be without those prefixes.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? It helps that it’ll make the model faster to train - and keep the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.

+
+

The concept of approximate nearest neighbors

+

Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this was the need to calculate all, or nearly all distances between all data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).

+

There are several algorithms that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones are available here.

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Euclidean (L2) distance
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this by the category (or label). So let’s design a function that sends only the necessary data as that for a particular category, and then create the database. We’ll need three pieces of information from this function:

+
    +
  1. The actual faiss database.
  2. +
  3. The actual subset of data that was used to build this index.
  4. +
  5. The label indices with respect to the original data that went into the faiss database.
  6. +
+

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = faiss_data.shape[1]         # Number of dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - the scores are cosine similarities,
+19    # always bounded between -1 and 1. With Euclidean or another unbounded distance, we'd have to spend
+20    # time finding a good range where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(faiss_data)        # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method to find the k nearest neighbors for a given index, along with the (approximate) distance to it. Let’s then write a function to get the following information: the label index for whom nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an edge list i.e. a list of pair of nodes that are connected, along with any additional information that specifies a property (in this case distance) of the edge that connects these nodes.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map the data back to the original `train[b'data']` array
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map the indices back to the original strings
+ 6    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B (within a particular minimum threshold), then A will be connected to C through B! Hopefully the small visual below helps.

+

How a network component is formed.

+

What happens when such a concept is extended to many data points? Not all of them will be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of this otherwise rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very Starry Night-esque vibes here. Let’s get to the code.

+ + + + + +
1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phones Accessories.

+ + + + + +
1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Make sure to configure the following if your results aren’t good enough:

+
    +
  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. +
  3. Increase the number of nearest neighbors if you want more matches.
  4. +
+

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

+ + + + + +
1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

+ + + + + +
 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

+ + + + + +
1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

+ + + + + +
 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map the data back to the original `train[b'data']` array
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map the indices back to the original strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want to write down an algorithmic way of looking at this approach,

+
    +
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. +
  3. Create an ANN database (based on a package such as faiss) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  4. +
  5. Obtain an edge-list of k (from 5 to 100) nearest neighbors for all (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.
  6. +
  7. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.
  8. +
  9. Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!
  10. +
+]]>
\ No newline at end of file diff --git a/public/tags/samples/index.html b/public/tags/samples/index.html new file mode 100644 index 0000000..e78be10 --- /dev/null +++ b/public/tags/samples/index.html @@ -0,0 +1,6 @@ +Samples | Avinash's Blog
+

Avinash's Blog

Filtering for "Samples"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/samples/index.xml b/public/tags/samples/index.xml new file mode 100644 index 0000000..5746d8c --- /dev/null +++ b/public/tags/samples/index.xml @@ -0,0 +1,383 @@ +Samples on Avinash's Bloghttps://avimallu.dev/tags/samples/Recent content in Samples on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaThu, 19 Oct 2023 00:00:00 +0000Finding representative samples efficiently for large datasetshttps://avimallu.dev/blog/002_representative_samples/Thu, 19 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/002_representative_samples/<h1 id="premise">Premise</h1> +<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.</p> +<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p> +<ol> +<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li> +<li>You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels.</li> +<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li> +</ol> +<h2 id="in-a-hurry">In a hurry?</h2> +<p>Here&rsquo;s what you need to do:</p>Premise +

In this day and age, we’re not short on data. Good data, on the other hand, is very valuable. When you’ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.

+

Let’s formalize the problem a little so that a proper approach can be developed. Here’s the problem statement:

+
    +
  1. You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
  2. +
  3. You need to train a model to classify these data points on either these labels, or on labels derived from the imperfect labels.
  4. +
  5. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
  6. +
+

In a hurry?

+

Here’s what you need to do:

+
    +
  1. Read the premise and see if it fits your problem.
  2. +
  3. Go to the For the folks in a hurry! section at the end to find the generic solution and how it works.
  4. +
+

Why do we need representative samples?

+

Generally, three things come to mind:

+
    +
  1. Allows the model to be generalizable for all kinds of data points within a category.
  2. +
  3. Allows for faster training of the model - you need fewer data points to get the same accuracy!
  4. +
  5. Allows for easier maintenance of the training set - if your training set needs validation or annotation by experts, this keeps your costs low!
  6. +
+

Define the data

+

This data can be practically anything that can be represented as a 2D matrix.

+

There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten it, there will be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some generic task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.
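To make that workaround concrete, here is a minimal sketch of such a feature extractor. This is an assumption on my part (it uses PyTorch and torchvision with a generic pretrained ResNet, and a preprocessed `images` tensor) rather than something the rest of this post depends on:

import torch
import torchvision

# Load a generic pretrained CNN and drop its classification head, so the
# forward pass returns the final hidden representation instead of class logits.
backbone = torchvision.models.resnet18(weights="IMAGENET1K_V1")
backbone.fc = torch.nn.Identity()
backbone.eval()

# `images` is assumed to be a float tensor of shape (N, 3, 224, 224) that has
# already been resized and normalized the way the pretrained model expects.
with torch.no_grad():
    image_embeddings = backbone(images).cpu().numpy()  # shape: (N, 512)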

+

Get a specific dataset

+

For this specific article, I will use the ShopMania dataset on Kaggle. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I’m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:

+
+

NOTE: whenever I want to show an output along with the code I used for it, you’ll see the characters >> indicating the command used; the output appears without that prefix.

+
+ + + + + +
 1>> import polars as pl
+ 2>> data = pl.read_csv("archive/shopmania.csv")
+ 3>> data
+ 4shape: (313_705, 4)
+ 5┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐
+ 6 product_ID  product_title                                         category_ID  category_label 
+ 7 ---         ---                                                   ---          ---            
+ 8 i64         str                                                   i64          str            
+ 9╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡
+10 2           twilight central park print                           2            Collectibles   
+11 3           fox print                                             2            Collectibles   
+12 4           circulo de papel wall art                             2            Collectibles   
+13 5           hidden path print                                     2            Collectibles   
+14                                                                                           
+15 313703      deago anti fog swimming diving full face mask         229          Water Sports   
+16             surface snorkel scuba fr gopro black s/m                                          
+17 313704      etc buys full face gopro compatible snorkel scuba     229          Water Sports   
+18             diving mask blue large/xtralarge blue                                             
+19 313705      men 039 s full face breathe free diving snorkel mask  229          Water Sports   
+20             scuba optional hd camera blue mask only adult men                                 
+21 313706      women 039 s full face breathe free diving snorkel     229          Water Sports   
+22             mask scuba optional hd camera black mask only                                     
+23             children and women                                                                
+24└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘

The data documentation on Kaggle states:

+
+

The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.

+
+

For demonstration, I’ll just limit the categories to those that have exactly 10,000 occurrences.

+ + + + + +
1data = (
+2    data
+3    .filter(pl.count().over("category_ID") == 10000)
+4)

You’ll notice that there are only 17 categories in this dataset. Run this to verify that fact.

+ + + + + +
 1>>> data.get_column("category_label").unique()
+ 2shape: (17,)
+ 3Series: 'category_label' [str]
+ 4[
+ 5    "Kitchen & Dining"
+ 6    "Scarves and wraps"
+ 7    "Handbags & Wallets"
+ 8    "Rugs  Tapestry & Linens"
+ 9    "Cell Phones Accessories"
+10    "Men's Clothing"
+11    "Jewelry"
+12    "Belts"
+13    "Men Lingerie"
+14    "Crafts"
+15    "Football"
+16    "Medical Supplies"
+17    "Adult"
+18    "Hunting"
+19    "Women's Clothing"
+20    "Pet Supply"
+21    "Office Supplies"
+22]

Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.

+

Specify the task

+

Okay - so now we have exactly 10,000 products per category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:

+
+

Craft a small representative sample for each category.

+
+

Why small? A smaller sample makes the model faster to train and keeps the training data manageable in size.

+

Finding representative samples

+

I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer’s rather simple: use SentenceTransformers to get a string’s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I’ve noticed that SentenceTransformers are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.

+

Getting SentenceTransformer embeddings

+

This part is rather simple. If you’re unable to install SentenceTransformers, please check their website.

+ + + + + +
1import sentence_transformers
+2# See list of models at www.sbert.net/docs/pretrained_models.html
+3ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+4title_embeddings = (
+5    ST.encode(
+6        data.get_column("product_title").to_list(),
+7        show_progress_bar=True, convert_to_tensor=True)
+8    .numpy())

This process will be slow (~30 minutes) if you don’t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to .numpy() at the end is to directly get a single numpy array - otherwise you get a list of numpy arrays, which is rather inefficient. Further, SentenceTransformers will try to run on the GPU if available, and if so, you will need to write .cpu().numpy() so that the tensor is copied from the GPU to the CPU.
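As a sketch of that last point (assuming you are on a CUDA machine - swap the device string for your hardware), the GPU-aware version of the call above would look roughly like this:

title_embeddings = (
    ST.encode(
        data.get_column("product_title").to_list(),
        device="cuda",  # or "mps" / "cpu", depending on your hardware
        show_progress_bar=True, convert_to_tensor=True)
    .cpu()    # copy the tensor from the GPU back to the CPU
    .numpy()) # then convert it to a single numpy array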

+
+

NOTE: for a proof-of-concept implementation, or if you’re on the CPU, try the all-MiniLM-L6-v2 model. It’s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.

+
+

The concept of approximate nearest neighbors

+

Performing any kind of nearest neighbor search on medium-scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between data points. Approximate nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that ANN algorithms take shortcuts to give you, if not the exact nearest neighbor, at least one of the nearest neighbors (hence the term approximate).

+

There are several algorithms that you can use - I shall proceed with faiss, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is available here.

+

I’ll explain why we’re in the nearest neighbor territory in due course.

+

Building the database

+

To build the database, all we need is the title_embeddings matrix.

+ + + + + +
1import faiss
+2def create_index(title_embeddings):
+3    d = title_embeddings.shape[1]    # Number of dimensions
+4    ann_index = faiss.IndexFlatL2(d) # Index using Euclidean (L2) distance
+5    ann_index.add(title_embeddings)  # Build the index
+6    
+7    return ann_index # Faiss considers databases an "index"

This does create a database. But remember, we’re trying to find representative samples - which means we need to do this per category (or label). So let’s design a function that selects only the data for a particular category and then creates the database. We’ll need three pieces of information from this function:

+
    +
  1. The actual faiss database.
  2. +
  3. The actual subset of data that was used to build this index.
  4. +
  5. The label indices with respect to the original data that went into the faiss database.
  6. +
+

(2) and (3) will help us later in rebuilding a “network graph” that will allow us to reference the original data points.

+ + + + + +
 1import faiss
+ 2import numpy as np
+ 3import polars as pl
+ 4
+ 5def create_index(label):
+ 6    faiss_indices = (
+ 7        data # this needs to be an argument if you want to create a generic function
+ 8        .with_row_count("row_idx")
+ 9        .filter(pl.col("category_label") == label)
+10        .get_column("row_idx")
+11        .to_list()
+12    )
+13    
+14    faiss_data = title_embeddings[faiss_indices]
+15    d = faiss_data.shape[1]         # Number of dimensions
+16    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+17    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+18    # Why cosine similarity? It's easier to specify thresholds - for normalized embeddings the
+19    # similarity always lies between -1 and 1. With Euclidean or other distances, we'd have to
+20    # spend time finding a good range first. See https://stats.stackexchange.com/a/146279 for details.
+21    faiss_DB.add(faiss_data)        # Build the index
+22    
+23    return faiss_DB, faiss_data, faiss_indices

Identifying the nearest neighbors

+

To proceed with getting a representative sample, the next step is to find the nearest neighbors for all data points in the database. This isn’t too hard - faiss index objects have a built-in search method to find the k nearest neighbors of a given index, along with the (approximate) distance to each. Let’s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors, and the distance between them. In network graph parlance, this kind of data is called an edge list, i.e. a list of pairs of nodes that are connected, along with any additional information that specifies a property (in this case, distance) of the edge that connects them.

+ + + + + +
 1def get_edge_list(label, k=5):
+ 2    faiss_DB, faiss_data, faiss_indices = create_index(label)
+ 3    # To map faiss index positions back to the original `data` row indices
+ 4    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+ 5    # To map the indices back to the original strings
+ 6    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+ 7    distances, neighbors = faiss_DB.search(faiss_data, k)
+ 8    
+ 9    return (
+10        pl.DataFrame({
+11            "from": faiss_indices})
+12        .with_columns(
+13            pl.Series("to", neighbors),
+14            pl.Series("distance", distances))
+15        .explode("to", "distance")
+16        .with_columns(
+17            pl.col("from")
+18            .map_dict(title_name_map),
+19            pl.col("to")
+20            .map_dict(faiss_indices_map)
+21            .map_dict(title_name_map))
+22        .filter(pl.col("from") != pl.col("to"))
+23    )                   

NetworkX and Connected Components

+

The next step in the process is to create a network graph using the edge-list. But why?

+

Remember that we have identified the (k=5) nearest neighbors of each data point. Let’s say that we have a point A that has a nearest neighbor B. C is not a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B within a particular minimum threshold, then A will be connected to C through B! Hopefully the small visual below will help.

+

How a network component is formed.
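If you want to sanity-check that idea in code, here is a tiny toy example (not part of the pipeline) showing networkx doing exactly that:

import networkx as nx

toy = nx.Graph()
toy.add_edge("A", "B")  # A and B are nearest neighbors
toy.add_edge("B", "C")  # C is a nearest neighbor of B, but not of A
print(list(nx.connected_components(toy)))
# [{'A', 'B', 'C'}] -> A lands in the same cluster as C, through B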

+

What happens when such a concept is extended to many data points? Not all of them would be connected - because we’re applying a minimum threshold that they have to meet. This is the only heuristic part of this rather fast process. Here’s one more helpful visual:

+

How a network cluster is formed.

+

Very Starry Night-esque vibes here. Let’s get to the code.

+ + + + + +
1import networkx as nx
+2def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+3    edge_list = (
+4        get_edge_list(label, k=k)
+5        .filter(pl.col("distance") >= min_cosine_distance)
+6    )
+7    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+8    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}

Getting clusters

+

Now that all the parts of the puzzle are together, let’s run it to see what kind of clusters you get for Cell Phones Accessories.

+ + + + + +
1clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)

Make sure to tweak the following if your results aren’t good enough (a quick parameter sweep is sketched right after this list):

+
    +
  1. Relax the min_cosine_distance value if you want bigger clusters.
  2. +
  3. Increase the number of nearest neighbors if you want more matches.
  4. +
+
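If you are unsure where to start, a quick sweep over a few settings - reusing the get_cluster_map function from above, with values that are only a suggestion - makes the trade-off visible:

for k in (5, 10):
    for threshold in (0.90, 0.93, 0.95):
        n_clusters = len(get_cluster_map("Cell Phones Accessories", k, threshold))
        print(f"k={k}, min_cosine_distance={threshold}: {n_clusters} clusters")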

Viewing the components

+

There will likely be many clusters (you can see how many exactly with len(clusters)). Let’s look at a random cluster:

+ + + + + +
1>> clusters[3]
+2['smartphone lanyard with card slot for any phone up to 6 yellow 72570099',
+3 'smartphone lanyard with card slot for any phone up to 6 black 72570093',
+4 'smartphone lanyard with card slot for any phone up to 6 lightblue 72570097',
+5 'smartphone lanyard with card slot for any phone up to 6 blue 72570095',
+6 'smartphone lanyard with card slot for any phone up to 6 green 72570101',
+7 'smartphone lanyard with card slot for any phone up to 6 pink 72570091']

Let’s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).

+ + + + + +
 1>>> clusters[6]
+ 2['otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a',
+ 3 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58',
+ 4 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a',
+ 5 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d',
+ 6 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45',
+ 7 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16',
+ 8 ...
+ 9 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20',
+10 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17',
+11 'otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a',
+12 'otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08',
+13 'otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a',
+14 'otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22']

Running for all categories

+

This isn’t that hard (although it may take more than a moment). Just iterate it for each category!

+ + + + + +
1clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

For the folks in a hurry!

+

I get it - you often want a solution that “just works”. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren’t in a hurry, this also serves as a nice summary (and copy-pastable code)!

+

The code

+ + + + + +
 1import sentence_transformers
+ 2import faiss
+ 3import polars as pl
+ 4import numpy as np
+ 5
+ 6# Data is read here. You download the files from Kaggle here: 
+ 7# https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization
+ 8data = pl.read_csv("archive/shopmania.csv", new_columns=[
+ 9    "product_ID", "product_title", "category_ID", "category_label"])
+10data = (
+11    data
+12    .filter(pl.count().over("category_ID") == 10000)
+13    .with_row_count("row_idx")
+14)
+15
+16
+17# See list of models at www.sbert.net/docs/pretrained_models.html
+18ST = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
+19title_embeddings = (
+20    ST.encode(
+21        data.get_column("product_title").to_list(),
+22        # I'm on a MacBook, you should use `cuda` or `cpu`
+23        # if you've got different hardware.
+24        device="mps",
+25        show_progress_bar=True, convert_to_tensor=True)
+26    .cpu().numpy())
+27
+28# Code to create a FAISS index
+29def create_index(label):
+30    faiss_indices = (
+31        data # this needs to be an argument if you want to create a generic function
+32        .filter(pl.col("category_label") == label)
+33        .get_column("row_idx")
+34        .to_list()
+35    )
+36    
+37    faiss_data = title_embeddings[faiss_indices]
+38    d = faiss_data.shape[1]         # Number of dimensions
+39    faiss_DB = faiss.IndexFlatIP(d) # Index using Inner Product
+40    faiss.normalize_L2(faiss_data)  # Normalized L2 with Inner Product search = cosine similarity
+41    faiss_DB.add(faiss_data)        # Build the index
+42    
+43    return faiss_DB, faiss_data, faiss_indices
+44
+45# Code to create an edge-list
+46def get_edge_list(label, k=5):
+47    faiss_DB, faiss_data, faiss_indices = create_index(label)
+48    # To map faiss index positions back to the original `data` row indices
+49    faiss_indices_map = {i: x for i,x in enumerate(faiss_indices)}
+50    # To map the indices back to the original strings
+51    title_name_map = {i: x for i,x in data.select("row_idx", "product_title").rows()}
+52    distances, neighbors = faiss_DB.search(faiss_data, k)
+53    
+54    return (
+55        pl.DataFrame({
+56            "from": faiss_indices})
+57        .with_columns(
+58            pl.Series("to", neighbors),
+59            pl.Series("distance", distances))
+60        .explode("to", "distance")
+61        .with_columns(
+62            pl.col("from")
+63            .map_dict(title_name_map),
+64            pl.col("to")
+65            .map_dict(faiss_indices_map)
+66            .map_dict(title_name_map))
+67        .filter(pl.col("from") != pl.col("to"))
+68    )
+69
+70# Code to extract components from a Network Graph
+71import networkx as nx
+72def get_cluster_map(label, k=5, min_cosine_distance=0.95):
+73    edge_list = (
+74        get_edge_list(label, k=k)
+75        .filter(pl.col("distance") >= min_cosine_distance)
+76    )
+77    graph = nx.from_pandas_edgelist(edge_list.to_pandas(), source="from", target="to")
+78    return {i: list(x) for i,x in enumerate(nx.connected_components(graph))}
+79
+80# Example call to a single category to obtain its clusters
+81clusters = get_cluster_map("Cell Phones Accessories", 5, 0.95)
+82# Example call to **all** categories to obtain all clusters
+83clusters = [get_cluster_map(x, 5, 0.95) for x in data.get_column("category_label").unique()]

How the code works

+

If you want to write down an algorithmic way of looking at this approach, here it is:

+
    +
  1. Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol’ tabular dataset where all numbers are normalized and can be expressed as such.
  2. +
  3. Create an ANN database (based on a package such as faiss) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.
  4. +
  5. Obtain an edge-list of k (from 5 to 100) nearest neighbors for all (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.
  6. +
  7. Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.
  8. +
  9. Map all indices back to their source data points, pick any number of items from each cluster (I usually pick one element per cluster), and you now have your representative sample! A short sketch of this final step follows this list.
  10. +
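As promised, here is a sketch of that final picking step, assuming the clusters dictionary produced by the single-category call above:

# One representative product title per cluster
representative_titles = [members[0] for members in clusters.values()]
print(len(representative_titles), "representative products")

# For the all-categories version (a list of dictionaries), flatten it first:
# representative_titles = [members[0]
#                          for category_clusters in clusters
#                          for members in category_clusters.values()]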
+]]>
\ No newline at end of file diff --git a/public/tags/vba/index.html b/public/tags/vba/index.html new file mode 100644 index 0000000..d478cda --- /dev/null +++ b/public/tags/vba/index.html @@ -0,0 +1,6 @@ +Vba | Avinash's Blog
+

Avinash's Blog

Filtering for "Vba"

© Avinash Mallya | Design via Bear Cub.
\ No newline at end of file diff --git a/public/tags/vba/index.xml b/public/tags/vba/index.xml new file mode 100644 index 0000000..7e60b87 --- /dev/null +++ b/public/tags/vba/index.xml @@ -0,0 +1,101 @@ +Vba on Avinash's Bloghttps://avimallu.dev/tags/vba/Recent content in Vba on Avinash's BlogHugo -- gohugo.ioen-US© Avinash MallyaFri, 20 Oct 2023 00:00:00 +0000Quick hacks to make client-ready presentationshttps://avimallu.dev/blog/003_powerpointsnap/Fri, 20 Oct 2023 00:00:00 +0000https://avimallu.dev/blog/003_powerpointsnap/<h1 id="premise">Premise</h1> +<p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (<em>decks</em> in consulting lingo - not even <em>slide decks</em>). However, it was rather repetitive. Thus, was born PowerPointSnap.</p> +<h1 id="what-is-it">What is it?</h1> +<p>I&rsquo;ll write this down as pointers.</p> +<ol> +<li>It&rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.</li> +<li>It&rsquo;s Windows only - it&rsquo;s unlikely to work on MacOS.</li> +<li>It&rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.</li> +</ol> +<h1 id="how-do-i-get-it">How do I get it?</h1> +<p>The project is available on this <a href="https://github.com/avimallu/PowerPointSnap">Github repo</a>. The instructions to install it are available there, but here&rsquo;s the down-low:</p>Premise +

When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (decks in consulting lingo - not even slide decks). However, it was rather repetitive. Thus was born PowerPointSnap.

+

What is it?

+

I’ll write this down as pointers.

+
    +
  1. It’s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.
  2. +
  3. It’s Windows only - it’s unlikely to work on MacOS.
  4. +
  5. It’s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.
  6. +
+

How do I get it?

+

The project is available on this Github repo. The instructions to install it are available there, but here’s the lowdown:

+
    +
  1. Download the Snap.ppam file to your system.
  2. +
  3. Enable the developer options.
  4. +
  5. Go to the Developer tab, and click on PowerPoint Add-ins.
  6. +
  7. Click on Add New. Choose the location of the file you just downloaded. Click Close.
  8. +
  9. To uninstall, repeat the process, and simply click on Remove this time.
  10. +
+

What can I do with it?

+

Frankly, a LOT. The base concept of this tool is:

+
    +
  1. “Set” a shape as the one you want to copy a property from.
  2. +
  3. Select any property from the list to automatically apply it.
  4. +
+

Here’s a non-exhaustive list of the options available.

+

Apply properties of shapes directly

+

This is the part of the interface that can be used for shapes (which include charts and tables).

+

The UI for copying shape properties

+

To use it, first select a shape object and click on “Set”. Then choose the object you want to Snap the properties onto (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error and let you exit.

+

Note that it’s probably not a good idea to apply a property of a shape to a table - if you want to make the entire table orange, there are probably better built-in ways to do it than to use Snap.

+

Beautify charts with Snappable properties

+

Charts are also supported, with dedicated features of their own.

+

The UI for copying chart properties

+

What do these features do? You should be able to hover over the option and get a tooltip that shows what it’s capable of, but here’s another summary just in case:

+
    +
  1. Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the “set” chart to the one you’ve selected. I couldn’t put in just $x$ and $y$ here because Microsoft internally doesn’t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn’t work well yet for 3D charts.
  2. +
  3. Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all look exactly the same from a client perspective. But that’s usually difficult if you’ve already configured the charts a little - which can be remedied with this option!
  4. +
  5. Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you’ve selected with the way it originally is in the “set” chart. The reason for this feature is simply to avoid going back to Home to click on the Format Painter option again.
  6. +
  7. Reset Axes Scales: in case you messed up somewhere, you can use this to revert to PowerPoint defaults.
  8. +
+

The next two options deserve their own section.

+

Customize the labels programmatically

+

Your immediate senior in a consulting environment would frown at your chart, and then exclaim, “I think that’s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it’s a one time thing!”

+

It’s never a one-time affair. But don’t worry, we have this nice feature to help us. If you click on the Customize Label option, you will get this (without the “Set” option):

+

The UI for customizing labels.

+

Never mind the rather unfriendly legend entries. They’re just here to demonstrate the kinds of wacky things you can do with your own chart!

+

Screenshots of the chart snapability

+

Of course, visuals will do it more justice. For example, look at this image:

+

There’s a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren’t centered.

+

Here’s what you can do:

+
    +
  1. Click on the left chart. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right chart, and then go through the following: +
      +
    1. In Shapes, click on Dim. This will align the shapes of the chart.
    2. +
    3. Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.
    4. +
    5. You’ll notice that the chart area still doesn’t match, nor does the title.
    6. +
    7. In Charts, click on Sync Plot Area and Sync Title Area, and watch the magic unfold.
    8. +
    9. Now, click on the second chart, and click on “Set”. Let’s align the axes of the first chart to the second one.
    10. +
    11. Click on the first chart, and then in Charts, click Sync Value Axis.
    12. +
    +
  4. +
  5. Let’s bring that senior’s exclamation back into play - (s)he wants you to highlight only Profit labels, and that too every 2 iterations. To do this: +
      +
    1. Click on Customize Labels after clicking on either chart.
    2. +
    3. You’ll get the screen shown in the previous section. Make sure to adjust the values such that it’s exactly like the screenshot there.
    4. +
    5. Click on “Save and Run”. This will save the configuration you’ve selected, and run it on the chart you’ve selected.
    6. +
    7. Click the other chart. Then, in Charts, click on Rerun Customization.
    8. +
    +
  6. +
+

This is what your results should look like:

+

Everything is almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it so quickly… maybe they should change some requirements…

+

Of course, getting those calculations right is a whole different thing that will need some work.

+

Align table dimensions

+

Oftentimes, you have two tables that show similar values… you know the drill. Here’s what you can do in a scenario such as this:

+

Similar data, but vastly different tables.

+

This is what the Tables section of the tool looks like:

+

The UI for Tables

+

To align these tables together,

+
    +
  1. Click on the left table. Press “Set” in the toolbar for Snap.
  2. +
  3. Click on the right table.
  4. +
  5. Click on Shapes and, inside it, Dim. Now the shapes of the tables are the same.
  6. +
  7. In Tables, click on Sync Column Widths. Now the columns are also the same.
  8. +
  9. If you try to align by rows, it fails because the number of rows is not the same in the two tables.
  10. +
+

Here’s what you’ll end up with:

+

Similar data, and similar enough tables.

+

Pretty neat, eh?

+]]>
\ No newline at end of file diff --git a/static/favicon.ico b/static/favicon.ico new file mode 100644 index 0000000..5c7d39b Binary files /dev/null and b/static/favicon.ico differ diff --git a/themes/hugo-bearcub/.github/workflows/gh-pages.yml b/themes/hugo-bearcub/.github/workflows/gh-pages.yml new file mode 100644 index 0000000..9f07252 --- /dev/null +++ b/themes/hugo-bearcub/.github/workflows/gh-pages.yml @@ -0,0 +1,32 @@ +name: github pages + +on: + push: + branches: + - main # Set a branch that will trigger a deployment + pull_request: + +jobs: + deploy: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + with: + submodules: true # Fetch Hugo themes (true OR recursive) + fetch-depth: 0 # Fetch all history for .GitInfo and .Lastmod + + - name: Setup Hugo + uses: peaceiris/actions-hugo@v3 + with: + hugo-version: 'latest' + extended: true + + - name: Build + run: hugo --minify --gc --destination ../public --source ./exampleSite --themesDir ../.. --baseURL https://clente.github.io/hugo-bearcub/ + + - name: Deploy + uses: peaceiris/actions-gh-pages@v4 + if: github.ref == 'refs/heads/main' + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./public diff --git a/themes/hugo-bearcub/.gitignore b/themes/hugo-bearcub/.gitignore new file mode 100644 index 0000000..6f93da2 --- /dev/null +++ b/themes/hugo-bearcub/.gitignore @@ -0,0 +1,5 @@ +.hugo_build.lock +.DS_Store +resources/ +todo.md +exampleSite/public/ \ No newline at end of file diff --git a/themes/hugo-bearcub/LICENSE b/themes/hugo-bearcub/LICENSE new file mode 100644 index 0000000..5c93582 --- /dev/null +++ b/themes/hugo-bearcub/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2020 Jan Raasch +Copyright (c) 2023 Caio Lente + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/themes/hugo-bearcub/README.md b/themes/hugo-bearcub/README.md new file mode 100644 index 0000000..a084e50 --- /dev/null +++ b/themes/hugo-bearcub/README.md @@ -0,0 +1,245 @@ +# ᕦʕ •ᴥ•ʔᕤ Bear Cub + +[![github pages](https://github.com/clente/hugo-bearcub/actions/workflows/gh-pages.yml/badge.svg)](https://github.com/clente/hugo-bearcub/actions/workflows/gh-pages.yml) +[![MIT license](https://img.shields.io/github/license/clente/hugo-bearcub)](https://github.com/clente/hugo-bearcub/blob/main/LICENSE) + +## Overview + +🐻 A lightweight [Hugo](https://gohugo.io/) theme based on [Bear +Blog](https://bearblog.dev) and [Hugo Bear +Blog](https://github.com/janraasch/hugo-bearblog). 
+ +**Bear Cub** takes care of speed and optimization, so you can focus on writing +good content. It is free, multilingual, optimized for search engines, +no-nonsense, responsive, light, and fast. Really fast. + +## Installation + +Follow Hugo's [quick start](https://gohugo.io/getting-started/quick-start/) to +create an empty website and then clone **Bear Cub** into the themes directory as +a [Git submodule](https://git-scm.com/book/en/v2/Git-Tools-Submodules): + +```sh +git submodule add https://github.com/clente/hugo-bearcub themes/hugo-bearcub +``` + +To finish off, append a line to the site configuration file: + +```sh +echo 'theme = "hugo-bearcub"' >> hugo.toml +``` + +## Features + +Like [Bear Blog](https://bearblog.dev), this theme: +- Is free and open source +- Looks great on any device +- Makes tiny (~5kb), optimized, and awesome pages +- Has no trackers, ads, or scripts +- Automatically generates an RSS feed + +But that's not all! **Bear Cub** is also... + +### Accessible + +**Bear Cub** has a few accessibility upgrades when compared to its predecessors. +The color palette has been overhauled to make sure everything is +[readable](https://web.dev/color-and-contrast-accessibility/) for users with low +vision impairments or color deficiencies, and some interactive elements were +made bigger to facilitate [clicking](https://web.dev/accessible-tap-targets/) +for users with a motor impairment. + +These small changes mean that **Bear Cub** passes Google's [PageSpeed +test](https://pagespeed.web.dev/report?url=https%3A%2F%2Fclente.github.io%2Fhugo-bearcub%2F) +with flying colors. + +![PageSpeed score](https://raw.githubusercontent.com/clente/hugo-bearcub/main/images/pagespeed.webp) + +### Secure + +[**Bear Cub**'s demo](https://clente.github.io/hugo-bearcub/) is hosted on GitHub +and therefore I'm not in control of its [Content Security +Policy](https://infosec.mozilla.org/guidelines/web_security#content-security-policy). +However, the theme itself was made with security in mind: there are no inline +styles and it uses no JavaScript at all. + +If you want to improve your [Mozilla +Observatory](https://observatory.mozilla.org/) score even further, you should be +able to simply add a few headers to your hosting service's configuration (e.g. +[Netlify](https://docs.netlify.com/routing/headers/) or [Cloudflare +Pages](https://developers.cloudflare.com/pages/platform/headers/)) and never +have to think about it again. My `_headers` file, for example, looks like this: + +``` +/* + X-Content-Type-Options: nosniff + Strict-Transport-Security: "max-age=31536000; includeSubDomains; preload" env=HTTPS + Cache-Control: max-age=31536000, public + X-Frame-Options: deny + Referrer-Policy: no-referrer + Feature-Policy: microphone 'none'; payment 'none'; geolocation 'none'; midi 'none'; sync-xhr 'none'; camera 'none'; magnetometer 'none'; gyroscope 'none' + Content-Security-Policy: default-src 'none'; manifest-src 'self'; font-src 'self'; img-src 'self'; style-src 'self'; form-action 'none'; frame-ancestors 'none'; base-uri 'none' + X-XSS-Protection: 1; mode=block +``` + +### Multilingual + +If you need to write a blog that supports more than one language, **Bear Cub** +has you covered! Check out the demo's [`hugo.toml` +file](https://github.com/clente/hugo-bearcub/blob/main/exampleSite/hugo.toml) +for a sample of how you can setup multilingual support. + +By default, the theme creates a translation button that gets disabled when the +current page is only available in any other language. 
You can also choose to +hide this button (instead of disabling it) by setting `hideUntranslated = +true`. + +### More + +Every once in a while, as I keep using **Bear Cub**, I notice that there is some +functionality missing. Currently, these are the "advanced features" that I have +already implemented: + +- Full-text RSS feed: an enhanced RSS feed template that includes the (properly + encoded) full content of your posts in the feed itself. +- Static content: you can create empty blog entries that act as links to static + files by including `link: "{url}"` in a post's [front + matter](https://gohugo.io/content-management/front-matter/). You can also add + `render: false` to your [build + options](https://gohugo.io/content-management/build-options/#readout) to avoid + rendering blank posts. +- Skip link: a "skip to main content" link that is temporarily invisible, but + can be focused by people who need a keyboard to navigate the web (see [PR + #5](https://github.com/clente/hugo-bearcub/pull/5) by + [@2kool4idkwhat](https://github.com/2kool4idkwhat) for more information). +- Reply by email: if you supply an email address, the theme creates a "Reply to + this post by email" button at the end of every post (see Kev Quirk's [original + implementation](https://kevquirk.com/adding-the-post-title-to-my-reply-by-email-button)). + This button can be suppressed on a case-by-case by setting `hideReply: true` + in a post's [front matter](https://gohugo.io/content-management/front-matter/) + (see [PR #18](https://github.com/clente/hugo-bearcub/pull/18) by + [@chrsmutti](https://github.com/chrsmutti)). +- `absfigure` shortcode: a shortcut to use the `figure` shortcode that also + converts relative URLs into absolute URLs by using the `absURL` function. +- Single-use CSS (EXPERIMENTAL): you can add some styles to a single page by + writing the CSS you need in `assets/{custom_css}.css` and then including + `style: "{custom_css}.css"` in the [front + matter](https://gohugo.io/content-management/front-matter/) of said page. +- Conditional CSS (EXPERIMENTAL): since **Bear Cub** does syntax highlighting + without inline styles (see `hugo.toml` for more information), it only load its + `syntax.css` if, and only if, a code block is actually present in the current + page. +- Alternative "Herman" style (EXPERIMENTAL): if you want to check out a more + modern CSS style, you can change the `themeStyle` parameter to `"herman"` in + order to activate [Herman Martinus's](https://herman.bearblog.dev/) version of + the [Blogster Minimal](https://blogster-minimal.netlify.app/) theme for + [Astro](https://astro.build/). +- Dynamic social card generation (EXPERIMENTAL): if you don't add preview images + to a post, this template will generate one based on the title. You can see an + example below. + +![Social card example](https://raw.githubusercontent.com/clente/hugo-bearcub/main/images/social_card.webp) + +## Configuration + +**Bear Cub** can be customized with a `hugo.toml` file. Check out the +[configuration](https://github.com/clente/hugo-bearcub/blob/main/exampleSite/hugo.toml) +of the [demo](https://clente.github.io/hugo-bearcub/) for more information. + +```toml +# Basic config +baseURL = "https://example.com" +theme = "hugo-bearcub" +copyright = "John Doe (CC BY 4.0)" +defaultContentLanguage = "en" + +# Generate a nice robots.txt for SEO +enableRobotsTXT = true + +# Setup syntax highlighting without inline styles. 
For more information about +# why you'd want to avoid inline styles, see +# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Security-Policy/style-src#unsafe_inline_styles +[markup] + [markup.highlight] + lineNos = true + lineNumbersInTable = false + # This allows Bear Cub to use a variation of Dracula that is more accessible + # to people with poor eyesight. For more information about color contrast + # and accessibility, see https://web.dev/color-and-contrast-accessibility/ + noClasses = false + +# Multilingual mode config. More for information about how to setup translation, +# see https://gohugo.io/content-management/multilingual/ +[languages] + [languages.en] + title = "Bear Cub" + languageName = "en-US 🇺🇸" + LanguageCode = "en-US" + contentDir = "content" + [languages.en.params] + madeWith = "Made with [Bear Cub](https://github.com/clente/hugo-bearcub)" + [languages.pt] + title = "Bear Cub" + languageName = "pt-BR 🇧🇷" + LanguageCode = "pt-BR" + contentDir = "content.pt" + [languages.pt.params] + madeWith = "Feito com [Bear Cub](https://github.com/clente/hugo-bearcub)" + +[params] + # The description of your website + description = "Bear Cub Demo" + + # The path to your favicon + favicon = "images/favicon.png" + + # These images will show up when services want to generate a preview of a link + # to your site. Ignored if `generateSocialCard = true`. For more information + # about previews, see https://gohugo.io/templates/internal#twitter-cards and + # https://gohugo.io/templates/internal#open-graph + images = ["images/share.webp"] + + # This title is used as the site_name on the Hugo's internal opengraph + # structured data template + title = "Bear Cub" + + # Dates are displayed following the format below. For more information about + # formatting, see https://gohugo.io/functions/format/ + dateFormat = "2006-01-02" + + # If your blog is multilingual but you haven't translated a page, this theme + # will create a disabled link. By setting `hideUntranslated` to true, you can + # have the theme simply not show any link + hideUntranslated = false + + # (EXPERIMENTAL) This theme has two options for its CSS styles: "original" and + # "herman". The former is what you see on Bear Cub's demo (an optimized + # version of Hugo Bear Blog), while the latter has a more modern look based on + # Herman Martinus's version of the Blogster Minimal theme for Astro. + themeStyle = "original" + + # (EXPERIMENTAL) This theme is capable of dynamically generating social cards + # for posts that don't have `images` defined in their front matter; By setting + # `generateSocialCard` to false, you can prevent this behavior. For more + # information see layouts/partials/social_card.html + generateSocialCard = true + + # Social media. Delete any item you aren't using to make sure it won't show up + # in your website's metadata. + [params.social] + twitter = "example" # Twitter handle (without '@') + facebook_admin = "0000000000" # Facebook Page Admin ID + + # Author metadata. This is mostly used for the RSS feed of your site, but the + # email is also added to the footer of each post. You can hide the "reply to" + # link by using a `hideReply` param in front matter. 
+ [params.author] + name = "John Doe" # Your name as shown in the RSS feed metadata + email = "me@example.com" # Added to the footer so readers can reply to posts +``` + +## Contributing + +If you come across any problems while using **Bear Cub**, you can file an +[issue](https://github.com/clente/hugo-bearcub/issues) or create a [pull +request](https://github.com/clente/hugo-bearcub/pulls). diff --git a/themes/hugo-bearcub/assets/herman.css b/themes/hugo-bearcub/assets/herman.css new file mode 100644 index 0000000..f2573c3 --- /dev/null +++ b/themes/hugo-bearcub/assets/herman.css @@ -0,0 +1,202 @@ +:root { + font-size: 62.5%; /* 10px */ + --color-dark: #181a20; + --color-light: #fafafa; + --color-primary: #1a8fe3; + --size: 1rem; + --spacing: calc(var(--size) * 2.4); +} + +body { + background: var(--color-dark); + color: var(--color-light); + padding: 4rem; + font-family: Avenir, 'Avenir Next LT Pro', Montserrat, Corbel, 'URW Gothic', + source-sans-pro, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", + "Segoe UI Symbol", "Noto Color Emoji"; + font-size: calc(var(--size) * 1.8); + line-height: 1.5; + min-height: 80vh; + max-width: 1600px; + margin: 0 auto; + word-wrap: break-word; +} + +header, +main, +footer { + max-width: 70ch; + margin-inline: auto; +} + +header { + padding-bottom: var(--spacing); +} + +nav a, a.blog-tags { + margin-right: calc(var(--spacing) / 2); +} +a.blog-tags { + line-height: 2; +} + +main { + padding-bottom: var(--spacing); +} + +footer { + text-align: center; + padding-top: var(--spacing); +} + +a { + color: currentColor; + text-decoration-color: var(--color-primary); + text-decoration-thickness: 0.3ex; + text-underline-offset: 0.3ex; +} + +a:hover { + text-decoration-thickness: 0.4ex; +} + +img { + display: block; + max-width: 100%; + height: auto; +} + +h1, +h2, +h3, +h4 { + font-weight: 700; + line-height: 1.3; +} + +h1 { + font-size: calc(var(--size) * 4.2); +} +h2 { + font-size: calc(var(--size) * 3.4); +} +h3 { + font-size: calc(var(--size) * 2.6); +} +h4 { + font-size: calc(var(--size) * 1.8); +} + +ul, +ol { + padding-inline-start: var(--spacing); +} +li { + margin-block-start: var(--spacing); +} + +blockquote { + padding-inline-start: var(--spacing); + border-inline-start: 0.2em solid; + font-style: italic; + max-width: 50ch; +} + +:is(h1, h2, h3, h4, blockquote) { + margin-block-end: calc(var(--spacing) / 2); +} +:is(h1, h2, h3, h4) + * { + margin-block-start: calc(var(--spacing) / 3); +} +:is(h1, h2, h3, h4) + :where(h2, h3, h4) { + margin-block-start: calc(var(--spacing) * 2); +} + +.title { + text-decoration: none; +} +.title h1 { + font-size: calc(var(--size) * 3.4); + margin-top: calc(var(--spacing) / 2); +} + +ul.blog-posts { + list-style-type: none; + padding: unset; +} +ul.blog-posts li { + display: flex; + flex-direction: column; +} +ul.blog-posts li span { + min-width: 11ch; +} + +p.byline { + opacity: 0.5; +} + +code { + font-family: ui-monospace, 'Cascadia Code', 'Source Code Pro', + Menlo, Consolas, 'DejaVu Sans Mono', monospace; + padding: 2px calc(var(--spacing) / 4); + background-color: #282a36; + font-size: calc(var(--size) * 1.4); +} +pre code { + display: block; + padding: var(--spacing); + overflow-x: auto; + -webkit-text-size-adjust: 100%; + -moz-text-size-adjust: 100%; +} + +table { + width: 100%; +} +table, +th, +td { + border: 1px solid; + border-collapse: collapse; + border-color: var(--color-light); + padding: calc(var(--spacing) / 2); +} + +.disabled { + color: currentColor; + cursor: not-allowed; + opacity: 0.5; +} + 
+@media screen and (min-width: 600px) { + ul.blog-posts li { + flex-direction: row; + gap: calc(var(--spacing) / 2); + } +} + +/* "Skip to main content" link */ +.skip-link { + position: absolute; + top: 5; + transform: translateY(-600%); + transition: transform 0.5s; + background-color: #181a20; + padding: 6px; +} + +.skip-link:focus { + transform: translateY(0%); +} + +figure { + margin-inline-start: 0em; + margin-inline-end: 0em; +} + +figcaption > p { + margin-block-start: 9px; + text-align: center; + font-style: italic; +} diff --git a/themes/hugo-bearcub/assets/images/social_card_bg.png b/themes/hugo-bearcub/assets/images/social_card_bg.png new file mode 100644 index 0000000..857ea63 Binary files /dev/null and b/themes/hugo-bearcub/assets/images/social_card_bg.png differ diff --git a/themes/hugo-bearcub/assets/images/social_card_fg.png b/themes/hugo-bearcub/assets/images/social_card_fg.png new file mode 100644 index 0000000..e005b0a Binary files /dev/null and b/themes/hugo-bearcub/assets/images/social_card_fg.png differ diff --git a/themes/hugo-bearcub/assets/original.css b/themes/hugo-bearcub/assets/original.css new file mode 100644 index 0000000..480cc6a --- /dev/null +++ b/themes/hugo-bearcub/assets/original.css @@ -0,0 +1,180 @@ +body { + font-family: Verdana, sans-serif; + margin: auto; + padding: 20px; + max-width: 720px; + text-align: left; + background-color: #1d1f27; + word-wrap: break-word; + overflow-wrap: break-word; + line-height: 1.5; + color: #c9d1d9; +} + +h1, +h2, +h3, +h4, +h5, +h6, +strong, +b { + color: #eee; +} + +a { + color: #8cc2dd; +} + +.title { + text-decoration: none; + border: 0; +} +.title h1 { + font-size: 24px; + margin: 19.92px 0 19.92px 0; +} + +.title span { + font-weight: 400; +} + +nav a { + margin-right: 10px; +} + +textarea { + background-color: #252525; + color: #ddd; + width: 100%; + font-size: 16px; +} + +input { + background-color: #252525; + color: #ddd; + font-size: 16px; +} + +content { + line-height: 1.6; +} + +table { + width: 100%; +} + +table, +th, +td { + border: 1px solid; + border-collapse: collapse; + border-color: #c9d1d9; + padding: 5px; +} + +img { + max-width: 100%; + height: auto; +} + +code { + padding: 2px 5px; + color: #f8f8f2; + background-color: #282a36; +} + +pre code { + display: block; + padding: 20px; + white-space: pre-wrap; + font-size: 14px; + overflow-x: auto; + text-wrap: nowrap; +} + +blockquote { + border-left: 1px solid #999; + color: #ccc; + padding-left: 20px; + font-style: italic; +} + +footer { + padding: 25px; + text-align: center; +} + +.helptext { + color: #aaa; + font-size: small; +} + +.errorlist { + color: #eba613; + font-size: small; +} + +/* blog posts */ +ul.blog-posts { + list-style-type: none; + padding: unset; +} + +ul.blog-posts li { + display: flex; + margin-bottom: 10px; +} + +ul.blog-posts li span { + flex: 0 0 130px; +} + +ul.blog-posts li a:visited { + color: #8b6fcb; +} + +a.blog-tags { + line-height: 2; + margin-right: 12px; +} + +h3.blog-filter { + margin-bottom: 0; +} + +.disabled { + color: currentColor; + cursor: not-allowed; + opacity: 0.7; +} + +p.byline { + font-style: italic; +} + +/* "Skip to main content" link */ +.skip-link { + position: absolute; + top: 5; + transform: translateY(-600%); + transition: transform 0.5s; + background-color: #1d1f27; + padding: 6px; +} + +.skip-link:focus { + transform: translateY(0%); +} + +figure { + margin-inline-start: 0em; + margin-inline-end: 0em; +} + +figcaption > p { + margin-block-start: 0px; + text-align: center; + font-style: italic; 
+ color: #ccc; +} diff --git a/themes/hugo-bearcub/assets/syntax.css b/themes/hugo-bearcub/assets/syntax.css new file mode 100644 index 0000000..a28d967 --- /dev/null +++ b/themes/hugo-bearcub/assets/syntax.css @@ -0,0 +1,91 @@ +/* This style is a variant of Dracula that is more accessible to people with poor eyesight */ +/* See https://web.dev/color-and-contrast-accessibility/ */ +/* And https://xyproto.github.io/splash/docs/dracula.html */ +/* And https://github.com/alecthomas/chroma/blob/a40c95e447a577322e20eac58f2f7c0d026665b0/styles/dracula.xml */ + +/* Background */ .bg { color: #f8f8f2; background-color: #282a36; } +/* PreWrapper */ .chroma { color: #f8f8f2; background-color: #282a36; } +/* Other .chroma .x { } */ +/* Error .chroma .err { } */ +/* CodeLine .chroma .cl { } */ +/* LineLink */ .chroma .lnlinks { outline: none; text-decoration: none; color: inherit } +/* LineTableTD */ .chroma .lntd { vertical-align: top; padding: 0; margin: 0; border: 0; } +/* LineTable */ .chroma .lntable { border-spacing: 0; padding: 0; margin: 0; border: 0; } +/* LineHighlight */ .chroma .hl { background-color: #ffffcc } +/* LineNumbersTable */ .chroma .lnt { white-space: pre; user-select: none; margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #929292 } +/* LineNumbers */ .chroma .ln { white-space: pre; user-select: none; margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #929292 } +/* Line */ .chroma .line { display: flex; } +/* Keyword */ .chroma .k { color: #ff79c6 } +/* KeywordConstant */ .chroma .kc { color: #ff79c6 } +/* KeywordDeclaration */ .chroma .kd { color: #8be9fd; font-style: italic } +/* KeywordNamespace */ .chroma .kn { color: #ff79c6 } +/* KeywordPseudo */ .chroma .kp { color: #ff79c6 } +/* KeywordReserved */ .chroma .kr { color: #ff79c6 } +/* KeywordType */ .chroma .kt { color: #8be9fd } +/* Name .chroma .n { } */ +/* NameAttribute */ .chroma .na { color: #50fa7b } +/* NameBuiltin */ .chroma .nb { color: #8be9fd; font-style: italic } +/* NameBuiltinPseudo .chroma .bp { } */ +/* NameClass */ .chroma .nc { color: #50fa7b } +/* NameConstant .chroma .no { } */ +/* NameDecorator .chroma .nd { } */ +/* NameEntity .chroma .ni { } */ +/* NameException .chroma .ne { } */ +/* NameFunction */ .chroma .nf { color: #50fa7b } +/* NameFunctionMagic .chroma .fm { } */ +/* NameLabel */ .chroma .nl { color: #8be9fd; font-style: italic } +/* NameNamespace .chroma .nn { } */ +/* NameOther .chroma .nx { } */ +/* NameProperty .chroma .py { } */ +/* NameTag */ .chroma .nt { color: #ff79c6 } +/* NameVariable */ .chroma .nv { color: #8be9fd; font-style: italic } +/* NameVariableClass */ .chroma .vc { color: #8be9fd; font-style: italic } +/* NameVariableGlobal */ .chroma .vg { color: #8be9fd; font-style: italic } +/* NameVariableInstance */ .chroma .vi { color: #8be9fd; font-style: italic } +/* NameVariableMagic .chroma .vm { } */ +/* Literal .chroma .l { } */ +/* LiteralDate .chroma .ld { } */ +/* LiteralString */ .chroma .s { color: #f1fa8c } +/* LiteralStringAffix */ .chroma .sa { color: #f1fa8c } +/* LiteralStringBacktick */ .chroma .sb { color: #f1fa8c } +/* LiteralStringChar */ .chroma .sc { color: #f1fa8c } +/* LiteralStringDelimiter */ .chroma .dl { color: #f1fa8c } +/* LiteralStringDoc */ .chroma .sd { color: #f1fa8c } +/* LiteralStringDouble */ .chroma .s2 { color: #f1fa8c } +/* LiteralStringEscape */ .chroma .se { color: #f1fa8c } +/* LiteralStringHeredoc */ .chroma .sh { color: #f1fa8c } +/* LiteralStringInterpol */ .chroma .si { color: #f1fa8c } +/* LiteralStringOther */ .chroma .sx { 
color: #f1fa8c } +/* LiteralStringRegex */ .chroma .sr { color: #f1fa8c } +/* LiteralStringSingle */ .chroma .s1 { color: #f1fa8c } +/* LiteralStringSymbol */ .chroma .ss { color: #f1fa8c } +/* LiteralNumber */ .chroma .m { color: #bd93f9 } +/* LiteralNumberBin */ .chroma .mb { color: #bd93f9 } +/* LiteralNumberFloat */ .chroma .mf { color: #bd93f9 } +/* LiteralNumberHex */ .chroma .mh { color: #bd93f9 } +/* LiteralNumberInteger */ .chroma .mi { color: #bd93f9 } +/* LiteralNumberIntegerLong */ .chroma .il { color: #bd93f9 } +/* LiteralNumberOct */ .chroma .mo { color: #bd93f9 } +/* Operator */ .chroma .o { color: #ff79c6 } +/* OperatorWord */ .chroma .ow { color: #ff79c6 } +/* Punctuation .chroma .p { } */ +/* Comment */ .chroma .c { color: #8491b8 } +/* CommentHashbang */ .chroma .ch { color: #8491b8 } +/* CommentMultiline */ .chroma .cm { color: #8491b8 } +/* CommentSingle */ .chroma .c1 { color: #8491b8 } +/* CommentSpecial */ .chroma .cs { color: #8491b8 } +/* CommentPreproc */ .chroma .cp { color: #ff79c6 } +/* CommentPreprocFile */ .chroma .cpf { color: #ff79c6 } +/* Generic .chroma .g { } */ +/* GenericDeleted */ .chroma .gd { color: #ff5555 } +/* GenericEmph */ .chroma .ge { text-decoration: underline } +/* GenericError .chroma .gr { } */ +/* GenericHeading */ .chroma .gh { font-weight: bold } +/* GenericInserted */ .chroma .gi { color: #50fa7b; font-weight: bold } +/* GenericOutput */ .chroma .go { color: #44475a } +/* GenericPrompt .chroma .gp { } */ +/* GenericStrong .chroma .gs { } */ +/* GenericSubheading */ .chroma .gu { font-weight: bold } +/* GenericTraceback .chroma .gt { } */ +/* GenericUnderline */ .chroma .gl { text-decoration: underline } +/* TextWhitespace .chroma .w { } */ diff --git a/themes/hugo-bearcub/exampleSite/content.pt/_index.md b/themes/hugo-bearcub/exampleSite/content.pt/_index.md new file mode 100644 index 0000000..4cf0d7e --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/content.pt/_index.md @@ -0,0 +1,35 @@ +--- +title: "Início" +menu: "main" +weight: 1 +--- + +# ᕦʕ •ᴥ•ʔᕤ Bear Cub + +Os sites de hoje em dia são pesados, lentos e cheios de scripts, propagandas e +rastreadores. Isso ficou tão comum que nós perdemos toda a perspectiva, tanto +que achamos normal uma página web ter vários megabytes. + +> The internet has become a bloated mess. Huge JavaScript libraries, countless +> client-side queries and overly complex frontend frameworks are par for the +> course these days. +> +> --- [Kev Quirk](https://512kb.club/) + +Vamos mudar isso, um site de cada vez! **Bear Cub** é um tema +[Hugo](https://gohugo.io/) baseado no [Hugo +Bear](https://github.com/janraasch/hugo-bearblog/) que toma conta da velocidade +e otimização para que você possa focar em escrever bons textos. + +Ele é gratuito, multilíngue, otimizado para buscadores, simples, responsivo, +leve e rápido. Muito rápido. + +Quando comparado ao seu predecessor, o **Bear Cub** tem alguns upgrades de +[privacidade](https://themarkup.org/blacklight?url=clente.github.io/hugo-bearcub/) +e +[acessibilidade](https://pagespeed.web.dev/report?url=https%3A%2F%2Fclente.github.io%2Fhugo-bearcub%2F). +Ele também é compatível com as práticas mais modernas de +[segurança](https://github.com/clente/hugo-bearcub#secure) para que seus +usuários possam aproveitar seu site sem medo. + +Feito com 💟 por [Caio lente](https://lente.dev). 
diff --git a/themes/hugo-bearcub/exampleSite/content.pt/blog/_index.md b/themes/hugo-bearcub/exampleSite/content.pt/blog/_index.md new file mode 100644 index 0000000..7588ac1 --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/content.pt/blog/_index.md @@ -0,0 +1,5 @@ +--- +title: "Blog" +menu: "main" +weight: 2 +--- \ No newline at end of file diff --git a/themes/hugo-bearcub/exampleSite/content/_index.md b/themes/hugo-bearcub/exampleSite/content/_index.md new file mode 100644 index 0000000..10c388f --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/content/_index.md @@ -0,0 +1,35 @@ +--- +title: "Home" +menu: "main" +weight: 1 +--- + +# ᕦʕ •ᴥ•ʔᕤ Bear Cub + +Today's websites are bloated, slow, and full of scripts, ads, and trackers. This +became so commonplace that we lost all sense of perspective, to the point that +we now think multi-megabyte webpages are normal. + +> The internet has become a bloated mess. Huge JavaScript libraries, countless +> client-side queries and overly complex frontend frameworks are par for the +> course these days. +> +> --- [Kev Quirk](https://512kb.club/) + +Let's change this, one website at a time! **Bear Cub** is a +[Hugo](https://gohugo.io/) theme based on [Hugo +Bear](https://github.com/janraasch/hugo-bearblog/) that takes care of speed and +optimization, so you can focus on writing good content. + +It is free, multilingual, optimized for search engines, no-nonsense, responsive, +light, and fast. Really fast. + +When compared to its predecessor, **Bear Cub** has a few +[privacy](https://themarkup.org/blacklight?url=clente.github.io/hugo-bearcub/) +and +[accessibility](https://pagespeed.web.dev/report?url=https%3A%2F%2Fclente.github.io%2Fhugo-bearcub%2F) +upgrades. It's also compatible with modern +[security](https://github.com/clente/hugo-bearcub#secure) standards, so your +users don't have to worry about browsing your website. + +Made with 💟 by [Caio lente](https://lente.dev/en). diff --git a/themes/hugo-bearcub/exampleSite/content/blog/_index.md b/themes/hugo-bearcub/exampleSite/content/blog/_index.md new file mode 100644 index 0000000..7588ac1 --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/content/blog/_index.md @@ -0,0 +1,5 @@ +--- +title: "Blog" +menu: "main" +weight: 2 +--- \ No newline at end of file diff --git a/themes/hugo-bearcub/exampleSite/content/blog/emoji-support.md b/themes/hugo-bearcub/exampleSite/content/blog/emoji-support.md new file mode 100644 index 0000000..f17f49a --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/content/blog/emoji-support.md @@ -0,0 +1,46 @@ ++++ +author = "Hugo Authors" +title = "Emoji Support" +date = "2019-03-05" +description = "Guide to emoji usage in Hugo" +tags = [ + "emoji", +] ++++ + +Emoji can be enabled in a Hugo project in a number of ways. + +The [`emojify`](https://gohugo.io/functions/emojify/) function can be called directly in templates or [Inline Shortcodes](https://gohugo.io/templates/shortcode-templates/#inline-shortcodes). + +To enable emoji globally, set `enableEmoji` to `true` in your site's [configuration](https://gohugo.io/getting-started/configuration/) and then you can type emoji shorthand codes directly in content files; e.g. + +

🙈 :see_no_evil: 🙉 :hear_no_evil: 🙊 :speak_no_evil:

+
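For reference, the global switch mentioned above is a single top-level key in the site configuration (a minimal sketch; everything else in the file is omitted):

```toml
# hugo.toml, minimal sketch; lets :shorthand: codes render as emoji in content files
enableEmoji = true
```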
+ +The [Emoji cheat sheet](http://www.emoji-cheat-sheet.com/) is a useful reference for emoji shorthand codes. + +*** + +**N.B.** The above steps enable Unicode Standard emoji characters and sequences in Hugo, however the rendering of these glyphs depends on the browser and the platform. To style the emoji you can either use a third party emoji font or a font stack; e.g. + +{{< highlight html >}} +.emoji { + font-family: Apple Color Emoji, Segoe UI Emoji, NotoColorEmoji, Segoe UI Symbol, Android Emoji, EmojiSymbols; +} +{{< /highlight >}} + +{{< css.inline >}} + +{{< /css.inline >}} diff --git a/themes/hugo-bearcub/exampleSite/content/blog/markdown-syntax.md b/themes/hugo-bearcub/exampleSite/content/blog/markdown-syntax.md new file mode 100644 index 0000000..06990d7 --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/content/blog/markdown-syntax.md @@ -0,0 +1,148 @@ ++++ +author = "Hugo Authors" +title = "Markdown Syntax Guide" +date = "2019-03-11" +description = "Sample article showcasing basic Markdown syntax and formatting for HTML elements." +tags = [ + "markdown", + "css", + "html", +] +categories = [ + "themes", + "syntax", +] +series = ["Themes Guide"] +aliases = ["migrate-from-jekyl"] ++++ + +This article offers a sample of basic Markdown syntax that can be used in Hugo content files, also it shows whether basic HTML elements are decorated with CSS in a Hugo theme. + + +## Headings + +The following HTML `
<h1>`—`<h6>` elements represent six levels of section headings. `<h1>` is the highest section level while `<h6>
` is the lowest. + +# H1 +## H2 +### H3 +#### H4 +##### H5 +###### H6 + +## Paragraph + +Xerum, quo qui aut unt expliquam qui dolut labo. Aque venitatiusda cum, voluptionse latur sitiae dolessi aut parist aut dollo enim qui voluptate ma dolestendit peritin re plis aut quas inctum laceat est volestemque commosa as cus endigna tectur, offic to cor sequas etum rerum idem sintibus eiur? Quianimin porecus evelectur, cum que nis nust voloribus ratem aut omnimi, sitatur? Quiatem. Nam, omnis sum am facea corem alique molestrunt et eos evelece arcillit ut aut eos eos nus, sin conecerem erum fuga. Ri oditatquam, ad quibus unda veliamenimin cusam et facea ipsamus es exerum sitate dolores editium rerore eost, temped molorro ratiae volorro te reribus dolorer sperchicium faceata tiustia prat. + +Itatur? Quiatae cullecum rem ent aut odis in re eossequodi nonsequ idebis ne sapicia is sinveli squiatum, core et que aut hariosam ex eat. + +## Blockquotes + +The blockquote element represents content that is quoted from another source, optionally with a citation which must be within a `footer` or `cite` element, and optionally with in-line changes such as annotations and abbreviations. + +#### Blockquote without attribution + +> Tiam, ad mint andaepu dandae nostion secatur sequo quae. +> **Note** that you can use *Markdown syntax* within a blockquote. + +#### Blockquote with attribution + +> Don't communicate by sharing memory, share memory by communicating.
+> — Rob Pike[^1] + +[^1]: The above quote is excerpted from Rob Pike's [talk](https://www.youtube.com/watch?v=PAAkCSZUG1c) during Gopherfest, November 18, 2015. + +## Tables + +Tables aren't part of the core Markdown spec, but Hugo supports them out-of-the-box. + + Name | Age +--------|------ + Bob | 27 + Alice | 23 + +#### Inline Markdown within tables + +| Italics | Bold | Code | +| -------- | -------- | ------ | +| *italics* | **bold** | `code` | + +## Code Blocks + +#### Code block with backticks + +```html + + + + + Example HTML5 Document + + +

Test

+ + +``` + +#### Code block indented with four spaces + + + + + + Example HTML5 Document + + +

Test

+ + + +#### Code block with Hugo's internal highlight shortcode +{{< highlight html >}} + + + + + Example HTML5 Document + + +

Test

+ + +{{< /highlight >}} + +## List Types + +#### Ordered List + +1. First item +2. Second item +3. Third item + +#### Unordered List + +* List item +* Another item +* And another item + +#### Nested list + +* Fruit + * Apple + * Orange + * Banana +* Dairy + * Milk + * Cheese + +## Other Elements — abbr, sub, sup, kbd, mark + +GIF is a bitmap image format. + +H2O + +Xn + Yn = Zn + +Press CTRL+ALT+Delete to end the session. + +Most salamanders are nocturnal, and hunt for insects, worms, and other small creatures. diff --git a/themes/hugo-bearcub/exampleSite/content/blog/math-typesetting.md b/themes/hugo-bearcub/exampleSite/content/blog/math-typesetting.md new file mode 100644 index 0000000..62831a9 --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/content/blog/math-typesetting.md @@ -0,0 +1,49 @@ +--- +author: Hugo Authors +title: Math Typesetting +date: 2019-03-08 +description: A brief guide to setup KaTeX +math: true +--- + +Mathematical notation in a Hugo project can be enabled by using third party JavaScript libraries. + + +In this example we will be using [KaTeX](https://katex.org/) + +- Create a partial under `/layouts/partials/math.html` +- Within this partial reference the [Auto-render Extension](https://katex.org/docs/autorender.html) or host these scripts locally. +- Include the partial in your templates like so: + +```bash +{{ if or .Params.math .Site.Params.math }} +{{ partial "math.html" . }} +{{ end }} +``` + +- To enable KaTex globally set the parameter `math` to `true` in a project's configuration +- To enable KaTex on a per page basis include the parameter `math: true` in content files + +**Note:** Use the online reference of [Supported TeX Functions](https://katex.org/docs/supported.html) + +{{< math.inline >}} +{{ if or .Page.Params.math .Site.Params.math }} + + + + +{{ end }} +{{}} + +### Examples + +{{< math.inline >}} +

+Inline math: \(\varphi = \dfrac{1+\sqrt5}{2}= 1.6180339887…\) +

+{{}} + +Block math: +$$ + \varphi = 1+\frac{1} {1+\frac{1} {1+\frac{1} {1+\cdots} } } +$$ diff --git a/themes/hugo-bearcub/exampleSite/content/blog/placeholder-text.md b/themes/hugo-bearcub/exampleSite/content/blog/placeholder-text.md new file mode 100644 index 0000000..9ed5f69 --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/content/blog/placeholder-text.md @@ -0,0 +1,45 @@ ++++ +author = "Hugo Authors" +title = "Placeholder Text" +date = "2019-03-09" +description = "Lorem Ipsum Dolor Si Amet" +tags = [ + "markdown", + "text", +] ++++ + +Lorem est tota propiore conpellat pectoribus de pectora summo. Redit teque digerit hominumque toris verebor lumina non cervice subde tollit usus habet Arctonque, furores quas nec ferunt. Quoque montibus nunc caluere tempus inhospita parcite confusaque translucet patri vestro qui optatis lumine cognoscere flos nubis! Fronde ipsamque patulos Dryopen deorum. + +1. Exierant elisi ambit vivere dedere +2. Duce pollice +3. Eris modo +4. Spargitque ferrea quos palude + +Rursus nulli murmur; hastile inridet ut ab gravi sententia! Nomine potitus silentia flumen, sustinet placuit petis in dilapsa erat sunt. Atria tractus malis. + +1. Comas hunc haec pietate fetum procerum dixit +2. Post torum vates letum Tiresia +3. Flumen querellas +4. Arcanaque montibus omnes +5. Quidem et + +# Vagus elidunt + + + +[The Van de Graaf Canon](https://en.wikipedia.org/wiki/Canons_of_page_construction#Van_de_Graaf_canon) + +## Mane refeci capiebant unda mulcebat + +Victa caducifer, malo vulnere contra dicere aurato, ludit regale, voca! Retorsit colit est profanae esse virescere furit nec; iaculi matertera et visa est, viribus. Divesque creatis, tecta novat collumque vulnus est, parvas. **Faces illo pepulere** tempus adest. Tendit flamma, ab opes virum sustinet, sidus sequendo urbis. + +Iubar proles corpore raptos vero auctor imperium; sed et huic: manus caeli Lelegas tu lux. Verbis obstitit intus oblectamina fixis linguisque ausus sperare Echionides cornuaque tenent clausit possit. Omnia putatur. Praeteritae refert ausus; ferebant e primus lora nutat, vici quae mea ipse. Et iter nil spectatae vulnus haerentia iuste et exercebat, sui et. + +Eurytus Hector, materna ipsumque ut Politen, nec, nate, ignari, vernum cohaesit sequitur. Vel **mitis temploque** vocatus, inque alis, *oculos nomen* non silvis corpore coniunx ne displicet illa. Crescunt non unus, vidit visa quantum inmiti flumina mortis facto sic: undique a alios vincula sunt iactata abdita! Suspenderat ego fuit tendit: luna, ante urbem Propoetides **parte**. + +{{< css.inline >}} + +{{< /css.inline >}} diff --git a/themes/hugo-bearcub/exampleSite/content/blog/rich-content.md b/themes/hugo-bearcub/exampleSite/content/blog/rich-content.md new file mode 100644 index 0000000..3c406af --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/content/blog/rich-content.md @@ -0,0 +1,34 @@ ++++ +author = "Hugo Authors" +title = "Rich Content" +date = "2019-03-10" +description = "A brief description of Hugo Shortcodes" +tags = [ + "shortcodes", + "privacy", +] ++++ + +Hugo ships with several [Built-in Shortcodes](https://gohugo.io/content-management/shortcodes/#use-hugos-built-in-shortcodes) for rich content, along with a [Privacy Config](https://gohugo.io/about/hugo-and-gdpr/) and a set of Simple Shortcodes that enable static and no-JS versions of various social media embeds. + +--- + +## YouTube Privacy Enhanced Shortcode + +{{< youtube ZJthWmvUzzc >}} + +
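The privacy-enhanced behaviour of the embed above is controlled by Hugo's privacy configuration rather than by the shortcode call itself. A minimal sketch of the relevant setting (the `[privacy.youtube]` keys are Hugo's own and are not part of this diff):

```toml
# hugo.toml, sketch; serves YouTube embeds from youtube-nocookie.com
[privacy]
  [privacy.youtube]
    privacyEnhanced = true
```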
+ +--- + +## Twitter Simple Shortcode + +{{< x user="DesignReviewed" id="1085870671291310081" >}} + +
+ +--- + +## Vimeo Simple Shortcode + +{{< vimeo_simple 48912912 >}} diff --git a/themes/hugo-bearcub/exampleSite/hugo.toml b/themes/hugo-bearcub/exampleSite/hugo.toml new file mode 100644 index 0000000..04690c0 --- /dev/null +++ b/themes/hugo-bearcub/exampleSite/hugo.toml @@ -0,0 +1,89 @@ +# Basic config +baseURL = "https://example.com" +theme = "hugo-bearcub" +copyright = "John Doe (CC BY 4.0)" +defaultContentLanguage = "en" + +# Generate a nice robots.txt for SEO +enableRobotsTXT = true + +# Setup syntax highlighting without inline styles. For more information about +# why you'd want to avoid inline styles, see +# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Security-Policy/style-src#unsafe_inline_styles +[markup] + [markup.highlight] + lineNos = true + lineNumbersInTable = false + # This allows Bear Cub to use a variation of Dracula that is more accessible + # to people with poor eyesight. For more information about color contrast + # and accessibility, see https://web.dev/color-and-contrast-accessibility/ + noClasses = false + +# Multilingual mode config. More for information about how to setup translation, +# see https://gohugo.io/content-management/multilingual/ +[languages] + [languages.en] + title = "Bear Cub" + languageName = "en-US 🇺🇸" + LanguageCode = "en-US" + contentDir = "content" + [languages.en.params] + madeWith = "Made with [Bear Cub](https://github.com/clente/hugo-bearcub)" + [languages.pt] + title = "Bear Cub" + languageName = "pt-BR 🇧🇷" + LanguageCode = "pt-BR" + contentDir = "content.pt" + [languages.pt.params] + madeWith = "Feito com [Bear Cub](https://github.com/clente/hugo-bearcub)" + +[params] + # The description of your website + description = "Bear Cub Demo" + + # The path to your favicon + favicon = "images/favicon.png" + + # These images will show up when services want to generate a preview of a link + # to your site. Ignored if `generateSocialCard = true`. For more information + # about previews, see https://gohugo.io/templates/internal#twitter-cards and + # https://gohugo.io/templates/internal#open-graph + images = ["images/share.webp"] + + # This title is used as the site_name on the Hugo's internal opengraph + # structured data template + title = "Bear Cub" + + # Dates are displayed following the format below. For more information about + # formatting, see https://gohugo.io/functions/format/ + dateFormat = "2006-01-02" + + # If your blog is multilingual but you haven't translated a page, this theme + # will create a disabled link. By setting `hideUntranslated` to true, you can + # have the theme simply not show any link + hideUntranslated = false + + # (EXPERIMENTAL) This theme has two options for its CSS styles: "original" and + # "herman". The former is what you see on Bear Cub's demo (an optimized + # version of Hugo Bear Blog), while the latter has a more modern look based on + # Herman Martinus's version of the Blogster Minimal theme for Astro. + themeStyle = "original" + + # (EXPERIMENTAL) This theme is capable of dynamically generating social cards + # for posts that don't have `images` defined in their front matter; By setting + # `generateSocialCard` to false, you can prevent this behavior. For more + # information see layouts/partials/social_card.html + generateSocialCard = true + + # Social media. Delete any item you aren't using to make sure it won't show up + # in your website's metadata. 
+ [params.social] + twitter = "example" # Twitter handle (without '@') + facebook_admin = "0000000000" # Facebook Page Admin ID + + # Author metadata. This is mostly used for the RSS feed of your site, but the + # email is also added to the footer of each post. You can hide the "reply to" + # link by using a `hideReply` param in front matter. + [params.author] + name = "John Doe" # Your name as shown in the RSS feed metadata + email = "me@example.com" # Added to the footer so readers can reply to posts diff --git a/themes/hugo-bearcub/exampleSite/static/images/favicon.png b/themes/hugo-bearcub/exampleSite/static/images/favicon.png new file mode 100644 index 0000000..c3ab3b9 Binary files /dev/null and b/themes/hugo-bearcub/exampleSite/static/images/favicon.png differ diff --git a/themes/hugo-bearcub/exampleSite/static/images/share.webp b/themes/hugo-bearcub/exampleSite/static/images/share.webp new file mode 100644 index 0000000..e190239 Binary files /dev/null and b/themes/hugo-bearcub/exampleSite/static/images/share.webp differ diff --git a/themes/hugo-bearcub/i18n/de.toml b/themes/hugo-bearcub/i18n/de.toml new file mode 100644 index 0000000..c8216d1 --- /dev/null +++ b/themes/hugo-bearcub/i18n/de.toml @@ -0,0 +1,14 @@ +[filtering-for] + other = "Filtern nach" + +[no-posts] + other = "Noch keine Beträge" + +[email-subject] + other = "Antwort auf " + +[email-reply] + other = "Auf diesen Beitrag per Email antworten " + +[skip-link] + other = "Zum Hauptinhalt" diff --git a/themes/hugo-bearcub/i18n/en.toml b/themes/hugo-bearcub/i18n/en.toml new file mode 100644 index 0000000..15a8569 --- /dev/null +++ b/themes/hugo-bearcub/i18n/en.toml @@ -0,0 +1,14 @@ +[filtering-for] + other = "Filtering for" + +[no-posts] + other = "No posts yet" + +[email-subject] + other = "Reply to " + +[email-reply] + other = "Reply to this post by email" + +[skip-link] + other = "Skip to main content" \ No newline at end of file diff --git a/themes/hugo-bearcub/i18n/ko.toml b/themes/hugo-bearcub/i18n/ko.toml new file mode 100644 index 0000000..35ac56e --- /dev/null +++ b/themes/hugo-bearcub/i18n/ko.toml @@ -0,0 +1,14 @@ +[filtering-for] + other = "태그 검색:" + +[no-posts] + other = "아직 작성된 게시물이 없습니다!" + +[email-subject] + other = "댓글: " + +[email-reply] + other = "이메일로 댓글 달기" + +[skip-link] + other = "본문으로 바로 가기" diff --git a/themes/hugo-bearcub/i18n/pt.toml b/themes/hugo-bearcub/i18n/pt.toml new file mode 100644 index 0000000..ad228ab --- /dev/null +++ b/themes/hugo-bearcub/i18n/pt.toml @@ -0,0 +1,14 @@ +[filtering-for] + other = "Filtrando para" + +[no-posts] + other = "Nenhum post ainda" + +[email-subject] + other = "Resposta a " + +[email-reply] + other = "Responda a este post por email" + +[skip-link] + other = "Pular para conteúdo principal" \ No newline at end of file diff --git a/themes/hugo-bearcub/i18n/tr.toml b/themes/hugo-bearcub/i18n/tr.toml new file mode 100644 index 0000000..a761585 --- /dev/null +++ b/themes/hugo-bearcub/i18n/tr.toml @@ -0,0 +1,14 @@ +[filtering-for] + other = "Filtrelenilen etiket" + +[no-posts] + other = "Henüz gönderi yok" + +[email-subject] + other = "Şu gönderiye yanıt: " + +[email-reply] + other = "Bu gönderiye eposta ile yanıt ver" + +[skip-link] + other = "Ana içeriğe geç" diff --git a/themes/hugo-bearcub/layouts/404.html b/themes/hugo-bearcub/layouts/404.html new file mode 100644 index 0000000..c6b3dfd --- /dev/null +++ b/themes/hugo-bearcub/layouts/404.html @@ -0,0 +1,6 @@ +{{ define "title" }}404{{ end }} + +{{ define "main" }} +
<h1>404</h1>
+<p>ʕノ•ᴥ•ʔノ ︵ ┻━┻</p>
+{{ end }} diff --git a/themes/hugo-bearcub/layouts/_default/_markup/render-codeblock.html b/themes/hugo-bearcub/layouts/_default/_markup/render-codeblock.html new file mode 100644 index 0000000..6e1a075 --- /dev/null +++ b/themes/hugo-bearcub/layouts/_default/_markup/render-codeblock.html @@ -0,0 +1,6 @@ + +{{ .Page.Store.Set "hasCodeBlock" true }} + + +{{ $result := transform.HighlightCodeBlock . }} +{{ $result.Wrapped }} \ No newline at end of file diff --git a/themes/hugo-bearcub/layouts/_default/baseof.html b/themes/hugo-bearcub/layouts/_default/baseof.html new file mode 100644 index 0000000..294749a --- /dev/null +++ b/themes/hugo-bearcub/layouts/_default/baseof.html @@ -0,0 +1,54 @@ + + + + + + + + {{- partial "favicon.html" . -}} + {{- block "title" . }}{{ with .Title }}{{ . }} | {{ end }}{{ .Site.Title }}{{- end }} + + {{- partial "seo_tags.html" . -}} + + + {{ $style := print (default "original" .Site.Params.themeStyle) ".css" | resources.Get | minify }} + + + {{ if (.Page.Store.Get "hasCodeBlock") }} + {{ $syntax := resources.Get "syntax.css" | minify }} + + {{ end }} + + {{ with .Params.style }} + {{ $extra := resources.Get . | minify }} + + {{ end }} + + {{ with .OutputFormats.Get "rss" -}} + {{ printf `` .Rel .MediaType.Type .Permalink $.Site.Title | safeHTML }} + {{ end -}} + + + {{- partial "custom_head.html" . -}} + + + +
+ {{- partial "header.html" . -}} +
+
+ {{- block "main" . }}{{- end }} +
+
+ {{- partial "footer.html" . -}} +
+ + + {{- partial "custom_body.html" . -}} + + + diff --git a/themes/hugo-bearcub/layouts/_default/list.html b/themes/hugo-bearcub/layouts/_default/list.html new file mode 100644 index 0000000..e6cc177 --- /dev/null +++ b/themes/hugo-bearcub/layouts/_default/list.html @@ -0,0 +1,36 @@ +{{ define "main" }} + + {{ if .Data.Singular }} +

{{ i18n "filtering-for" }} "{{ .Title }}"

+ {{ end }} +
    + {{ range .Pages }} +
  • + + + + + + {{ if .Params.link }} + {{ .Title }} ↪ + {{ else }} + {{ .Title }} + {{ end }} +
  • + {{ else }} +
  • + {{ i18n "no-posts" }} +
  • + {{ end }} +
+ {{ if not .Data.Singular }} +
+ {{ range .Site.Taxonomies.tags }} + #{{ lower .Page.Title }} + {{ end }} +
+ {{ end }} +
+{{ end }} diff --git a/themes/hugo-bearcub/layouts/_default/rss.xml b/themes/hugo-bearcub/layouts/_default/rss.xml new file mode 100644 index 0000000..70d1193 --- /dev/null +++ b/themes/hugo-bearcub/layouts/_default/rss.xml @@ -0,0 +1,72 @@ +{{- /* Deprecate site.Author.email in favor of site.Params.author.email */}} +{{- $authorEmail := "" }} +{{- with site.Params.author }} + {{- if reflect.IsMap . }} + {{- with .email }} + {{- $authorEmail = . }} + {{- end }} + {{- end }} +{{- else }} + {{- with site.Author.email }} + {{- $authorEmail = . }} + {{- warnf "The author key in site configuration is deprecated. Use params.author.email instead." }} + {{- end }} +{{- end }} + +{{- /* Deprecate site.Author.name in favor of site.Params.author.name */}} +{{- $authorName := "" }} +{{- with site.Params.author }} + {{- if reflect.IsMap . }} + {{- with .name }} + {{- $authorName = . }} + {{- end }} + {{- else }} + {{- $authorName = . }} + {{- end }} +{{- else }} + {{- with site.Author.name }} + {{- $authorName = . }} + {{- warnf "The author key in site configuration is deprecated. Use params.author.name instead." }} + {{- end }} +{{- end }} + +{{- $pctx := . }} +{{- if .IsHome }}{{ $pctx = .Site }}{{ end }} +{{- $pages := slice }} +{{- if or $.IsHome $.IsSection }} +{{- $pages = $pctx.RegularPages }} +{{- else }} +{{- $pages = $pctx.Pages }} +{{- end }} +{{- $limit := .Site.Config.Services.RSS.Limit }} +{{- if ge $limit 1 }} +{{- $pages = $pages | first $limit }} +{{- end }} +{{- printf "" | safeHTML }} + + + {{ if .IsHome }}{{ .Site.Title }}{{ else }}{{ with .Title }}{{ . }} on {{ end }}{{ .Site.Title }}{{ end }} + {{ .Permalink }} + Recent content {{ if not .IsHome }}{{ with .Title }}in {{ . }} {{ end }}{{ end }}on {{ .Site.Title }} + Hugo -- gohugo.io + {{ site.Language.LanguageCode }}{{ with $authorEmail }} + {{.}}{{ with $authorName }} ({{ . }}){{ end }}{{ end }}{{ with $authorEmail }} + {{ . }}{{ with $authorName }} ({{ . }}){{ end }}{{ end }}{{ with .Site.Copyright }} + {{ . }}{{ end }}{{ if not .Date.IsZero }} + {{ .Date.Format "Mon, 02 Jan 2006 15:04:05 -0700" | safeHTML }}{{ end }} + {{- with .OutputFormats.Get "RSS" }} + {{ printf "" .Permalink .MediaType | safeHTML }} + {{- end }} + {{- range $pages }} + + {{ .Title }} + {{ .Permalink }} + {{ .Date.Format "Mon, 02 Jan 2006 15:04:05 -0700" | safeHTML }} + {{- with $authorEmail }}{{ . }}{{ with $authorName }} ({{ . }}){{ end }}{{ end }} + {{ .Permalink }} + {{ .Summary | transform.XMLEscape | safeHTML }} + {{ `` | safeHTML }} + + {{- end }} + + diff --git a/themes/hugo-bearcub/layouts/_default/single.html b/themes/hugo-bearcub/layouts/_default/single.html new file mode 100644 index 0000000..2baecc4 --- /dev/null +++ b/themes/hugo-bearcub/layouts/_default/single.html @@ -0,0 +1,28 @@ +{{ define "main" }} +{{ if not .Params.menu }} +

{{ .Title }}

+ +{{ end }} + + {{ .Content }} + +

+ {{ range (.GetTerms "tags") }} + #{{ lower .LinkTitle }} + {{ end }} +

+{{ if not .Params.hideReply }} +{{ with .Site.Params.author.email }} +

+ + {{ i18n "email-reply" }} ↪ + +

+{{ end }} +{{ end }} +{{ end }} diff --git a/themes/hugo-bearcub/layouts/index.html b/themes/hugo-bearcub/layouts/index.html new file mode 100644 index 0000000..9983b08 --- /dev/null +++ b/themes/hugo-bearcub/layouts/index.html @@ -0,0 +1,3 @@ +{{ define "main" }} +{{ .Content }} +{{ end }} diff --git a/themes/hugo-bearcub/layouts/partials/custom_body.html b/themes/hugo-bearcub/layouts/partials/custom_body.html new file mode 100644 index 0000000..951fb15 --- /dev/null +++ b/themes/hugo-bearcub/layouts/partials/custom_body.html @@ -0,0 +1,3 @@ + diff --git a/themes/hugo-bearcub/layouts/partials/custom_head.html b/themes/hugo-bearcub/layouts/partials/custom_head.html new file mode 100644 index 0000000..4c53c40 --- /dev/null +++ b/themes/hugo-bearcub/layouts/partials/custom_head.html @@ -0,0 +1,3 @@ + diff --git a/themes/hugo-bearcub/layouts/partials/favicon.html b/themes/hugo-bearcub/layouts/partials/favicon.html new file mode 100644 index 0000000..d391a86 --- /dev/null +++ b/themes/hugo-bearcub/layouts/partials/favicon.html @@ -0,0 +1,2 @@ +{{ with .Site.Params.favicon }} +{{ end }} diff --git a/themes/hugo-bearcub/layouts/partials/footer.html b/themes/hugo-bearcub/layouts/partials/footer.html new file mode 100644 index 0000000..9290715 --- /dev/null +++ b/themes/hugo-bearcub/layouts/partials/footer.html @@ -0,0 +1,3 @@ + + {{ .Site.Copyright }} | {{ markdownify .Site.Params.madeWith }} + \ No newline at end of file diff --git a/themes/hugo-bearcub/layouts/partials/header.html b/themes/hugo-bearcub/layouts/partials/header.html new file mode 100644 index 0000000..42b5c10 --- /dev/null +++ b/themes/hugo-bearcub/layouts/partials/header.html @@ -0,0 +1,4 @@ + + +

{{ .Site.Title }}

+ diff --git a/themes/hugo-bearcub/layouts/partials/nav.html b/themes/hugo-bearcub/layouts/partials/nav.html new file mode 100644 index 0000000..b9400df --- /dev/null +++ b/themes/hugo-bearcub/layouts/partials/nav.html @@ -0,0 +1,22 @@ +{{ range .Site.Menus.main.ByWeight }} + {{ .Name }} +{{ end }} +rss + + +{{ $translations := dict }} +{{ range .Translations }} + {{ $translations = merge $translations (dict .Language.Lang .) }} +{{ end }} + + +{{ range where .Site.Languages "Lang" "!=" .Page.Lang }} + {{ with (index $translations .Lang) }} + {{ .Language.LanguageName }} + {{ else }} + + {{ if not .Params.hideUntranslated }} + {{ .LanguageName }} + {{ end }} + {{ end }} +{{ end }} diff --git a/themes/hugo-bearcub/layouts/partials/seo_tags.html b/themes/hugo-bearcub/layouts/partials/seo_tags.html new file mode 100644 index 0000000..6ab1422 --- /dev/null +++ b/themes/hugo-bearcub/layouts/partials/seo_tags.html @@ -0,0 +1,22 @@ + + + + + + + + + + +{{ if and (and (default false .Site.Params.generateSocialCard) (not (isset .Params "images"))) (eq .Kind "page") }} + {{ partial "social_card.html" . }} +{{ else }} + + {{ template "_internal/opengraph.html" . }} + + + {{ template "_internal/twitter_cards.html" . }} + + + {{ template "_internal/schema.html" . }} +{{ end }} diff --git a/themes/hugo-bearcub/layouts/partials/social_card.html b/themes/hugo-bearcub/layouts/partials/social_card.html new file mode 100644 index 0000000..4f89b4f --- /dev/null +++ b/themes/hugo-bearcub/layouts/partials/social_card.html @@ -0,0 +1,124 @@ + +{{ $font := resources.GetRemote "https://github.com/google/fonts/raw/main/ofl/firamono/FiraMono-Bold.ttf" }} +{{ $fg := resources.Get "images/social_card_fg.png"}} +{{ $bg := resources.Get "images/social_card_bg.png"}} + +{{ if gt (len .Title) 45 }} + {{ $fg = $fg.Filter (images.Text .Title (dict + "font" $font + "color" "#fafafa" + "size" 95 + "linespacing" 16 + "x" 0 + "y" 0 + )) }} +{{ else }} + {{ $fg = $fg.Filter (images.Text .Title (dict + "font" $font + "color" "#fafafa" + "size" 130 + "linespacing" 20 + "x" 0 + "y" 0 + )) }} +{{ end }} + +{{ $date := .Date.Format (default "2006-01-02" .Site.Params.dateFormat) }} +{{ $author := (default $.Site.Params.author.name ($.Param "author") ) }} +{{ $byline := (printf "%s | %s" $author $date) }} + +{{ $fg = $fg.Filter (images.Text $byline (dict + "font" $font + "color" "#898a8d" + "size" 60 + "linespacing" 30 + "x" 0 + "y" 425 +)) }} + +{{ $card := $bg.Filter (images.Overlay $fg 112 140 ) }} +{{ $card := $card.Resize "900x webp q100" }} + + + + + + + + + + +{{- if .IsPage }} +{{- $iso8601 := "2006-01-02T15:04:05-07:00" -}} + +{{ with .PublishDate }}{{ end }} +{{ with .Lastmod }}{{ end }} +{{- end -}} + +{{- with .Params.audio }}{{ end }} +{{- with .Params.locale }}{{ end }} +{{- with .Site.Params.title }}{{ end }} +{{- with .Params.videos }}{{- range . }} + +{{ end }}{{ end }} + +{{- /* If it is part of a series, link to related articles */}} +{{- $permalink := .Permalink }} +{{- $siteSeries := .Site.Taxonomies.series }} +{{- if $siteSeries }} +{{ with .Params.series }}{{- range $name := . }} + {{- $series := index $siteSeries ($name | urlize) }} + {{- range $page := first 6 $series.Pages }} + {{- if ne $page.Permalink $permalink }}{{ end }} + {{- end }} +{{ end }}{{ end }} +{{- end }} + +{{- $facebookAdmin := "" }} +{{- with site.Params.social }} + {{- if reflect.IsMap . 
}} + {{- $facebookAdmin = .facebook_admin }} + {{- end }} +{{- end }} + +{{- /* Facebook Page Admin ID for Domain Insights */}} +{{ with $facebookAdmin }}{{ end }} + + + + + + + + +{{- $twitterSite := "" }} +{{- with site.Params.social }} + {{- if reflect.IsMap . }} + {{- $twitterSite = .twitter }} + {{- end }} +{{- end }} + +{{- with $twitterSite }} + {{- $content := . }} + {{- if not (strings.HasPrefix . "@") }} + {{- $content = printf "@%v" $twitterSite }} + {{- end }} + +{{- end }} + + + + + + +{{- if .IsPage -}} +{{- $iso8601 := "2006-01-02T15:04:05-07:00" -}} +{{ with .PublishDate }}{{ end}} +{{ with .Lastmod }}{{ end}} + + + + + + +{{- end -}} diff --git a/themes/hugo-bearcub/layouts/robots.txt b/themes/hugo-bearcub/layouts/robots.txt new file mode 100644 index 0000000..4f4ca44 --- /dev/null +++ b/themes/hugo-bearcub/layouts/robots.txt @@ -0,0 +1,3 @@ +User-agent: * +Allow: / +Sitemap: {{ "sitemap.xml" | absURL }} diff --git a/themes/hugo-bearcub/layouts/shortcodes/absfigure.html b/themes/hugo-bearcub/layouts/shortcodes/absfigure.html new file mode 100644 index 0000000..ad60c6c --- /dev/null +++ b/themes/hugo-bearcub/layouts/shortcodes/absfigure.html @@ -0,0 +1,29 @@ + + {{- if .Get "link" -}} + + {{- end -}} + {{ with .Get + {{- if .Get "link" }}{{ end -}} + {{- if or (or (.Get "title") (.Get "caption")) (.Get "attr") -}} +
+ {{ with (.Get "title") -}} +

{{ . }}

+ {{- end -}} + {{- if or (.Get "caption") (.Get "attr") -}}

+ {{- .Get "caption" | markdownify -}} + {{- with .Get "attrlink" }} + + {{- end -}} + {{- .Get "attr" | markdownify -}} + {{- if .Get "attrlink" }}{{ end }}

+ {{- end }} +
+ {{- end }} + \ No newline at end of file diff --git a/themes/hugo-bearcub/layouts/shortcodes/highlight.html b/themes/hugo-bearcub/layouts/shortcodes/highlight.html new file mode 100644 index 0000000..6c86ca8 --- /dev/null +++ b/themes/hugo-bearcub/layouts/shortcodes/highlight.html @@ -0,0 +1,5 @@ + +{{ .Page.Store.Set "hasCodeBlock" true }} + + +{{ if len .Params | eq 2 }}{{ highlight (trim .InnerDeindent "\n\r") (.Get 0) (.Get 1) }}{{ else }}{{ highlight (trim .InnerDeindent "\n\r") (.Get 0) "" }}{{ end }} \ No newline at end of file diff --git a/themes/hugo-bearcub/layouts/shortcodes/rawhtml.html b/themes/hugo-bearcub/layouts/shortcodes/rawhtml.html new file mode 100644 index 0000000..520ec17 --- /dev/null +++ b/themes/hugo-bearcub/layouts/shortcodes/rawhtml.html @@ -0,0 +1,2 @@ + +{{.Inner}} \ No newline at end of file diff --git a/themes/hugo-bearcub/theme.toml b/themes/hugo-bearcub/theme.toml new file mode 100644 index 0000000..602a351 --- /dev/null +++ b/themes/hugo-bearcub/theme.toml @@ -0,0 +1,29 @@ +name = "Bear Cub" +license = "MIT" +licenselink = "https://github.com/clente/hugo-bearcub/blob/master/LICENSE" +description = "A lightweight Hugo theme based on Bear Blog and Hugo Bear Blog. It is free, multilingual, optimized for search engines, no-nonsense, responsive, light, and fast. Really fast." + +# The home page of the theme, where the source can be found. +homepage = "https://github.com/clente/hugo-bearcub" + +# If you have a running demo of the theme. +demosite = "https://clente.github.io/hugo-bearcub" + +tags = ["blog", "responsive", "minimal", "personal", "dark", "multilingual"] +features = ["favicon", "seo", "no javascript", "rss", "social cards"] + +# If the theme has a single author +[author] + name = "Caio Lente" + homepage = "https://lente.dev" + +# If porting an existing theme +[original] + author = "Jan Raasch" + homepage = "https://www.janraasch.com" + repo = "https://github.com/janraasch/hugo-bearblog" + +# Hugo versions the theme supports +[module] + [module.hugoVersion] + min = "0.90"
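The `[module.hugoVersion]` block above only declares that the theme needs Hugo 0.90 or newer. A consuming site can either set `theme = "hugo-bearcub"` (as the exampleSite `hugo.toml` in this diff does) or, when it uses Hugo Modules, import the theme roughly as follows (a sketch, not part of this diff; the module path simply mirrors the repository homepage above):

```toml
# hugo.toml of a consuming site, sketch; assumes Hugo Modules are initialised
[module]
  [[module.imports]]
    path = "github.com/clente/hugo-bearcub"
```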