Switch to Hugo

This commit is contained in:
Avinash Mallya
2025-09-13 21:27:23 -05:00
parent 1b4b1ad933
commit 57eff46d6c
150 changed files with 12296 additions and 467 deletions

View File

@@ -1,25 +0,0 @@
---
permalink: /404.html
layout: default
---
<style type="text/css" media="screen">
.container {
margin: 10px auto;
max-width: 600px;
text-align: center;
}
h1 {
margin: 30px 0;
font-size: 4em;
line-height: 1;
letter-spacing: -1px;
}
</style>
<div class="container">
<h1>404</h1>
<p><strong>Page not found :(</strong></p>
<p>The requested page could not be found.</p>
</div>

35
Gemfile
View File

@@ -1,35 +0,0 @@
source "https://rubygems.org"
# Hello! This is where you manage which Jekyll version is used to run.
# When you want to use a different version, change it below, save the
# file and run `bundle install`. Run Jekyll with `bundle exec`, like so:
#
# bundle exec jekyll serve
#
# This will help ensure the proper Jekyll version is running.
# Happy Jekylling!
# gem "jekyll", "~> 4.3.2"
# This is the default theme for new Jekyll sites. You may change this to anything you like.
# gem "hacker", "~> 0.2.0"
# If you want to use GitHub Pages, remove the "gem "jekyll"" above and
# uncomment the line below. To upgrade, run `bundle update github-pages`.
# If you have any plugins, put them here!
group :jekyll_plugins do
gem "jekyll-feed", "~> 0.12"
gem "github-pages", "~> 228"
end
# Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem
# and associated library.
platforms :mingw, :x64_mingw, :mswin, :jruby do
gem "tzinfo", ">= 1", "< 3"
gem "tzinfo-data"
end
# Performance-booster for watching directories on Windows
gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin]
# Lock `http_parser.rb` gem to `v0.6.x` on JRuby builds since newer versions of the gem
# do not have a Java counterpart.
gem "http_parser.rb", "~> 0.6.0", :platforms => [:jruby]
gem "webrick", "~> 1.8"

View File

@@ -1,265 +0,0 @@
GEM
remote: https://rubygems.org/
specs:
activesupport (7.0.5)
concurrent-ruby (~> 1.0, >= 1.0.2)
i18n (>= 1.6, < 2)
minitest (>= 5.1)
tzinfo (~> 2.0)
addressable (2.8.4)
public_suffix (>= 2.0.2, < 6.0)
coffee-script (2.4.1)
coffee-script-source
execjs
coffee-script-source (1.11.1)
colorator (1.1.0)
commonmarker (0.23.9)
concurrent-ruby (1.2.2)
dnsruby (1.70.0)
simpleidn (~> 0.2.1)
em-websocket (0.5.3)
eventmachine (>= 0.12.9)
http_parser.rb (~> 0)
ethon (0.16.0)
ffi (>= 1.15.0)
eventmachine (1.2.7)
execjs (2.8.1)
faraday (2.7.7)
faraday-net_http (>= 2.0, < 3.1)
ruby2_keywords (>= 0.0.4)
faraday-net_http (3.0.2)
ffi (1.15.5)
forwardable-extended (2.6.0)
gemoji (3.0.1)
github-pages (228)
github-pages-health-check (= 1.17.9)
jekyll (= 3.9.3)
jekyll-avatar (= 0.7.0)
jekyll-coffeescript (= 1.1.1)
jekyll-commonmark-ghpages (= 0.4.0)
jekyll-default-layout (= 0.1.4)
jekyll-feed (= 0.15.1)
jekyll-gist (= 1.5.0)
jekyll-github-metadata (= 2.13.0)
jekyll-include-cache (= 0.2.1)
jekyll-mentions (= 1.6.0)
jekyll-optional-front-matter (= 0.3.2)
jekyll-paginate (= 1.1.0)
jekyll-readme-index (= 0.3.0)
jekyll-redirect-from (= 0.16.0)
jekyll-relative-links (= 0.6.1)
jekyll-remote-theme (= 0.4.3)
jekyll-sass-converter (= 1.5.2)
jekyll-seo-tag (= 2.8.0)
jekyll-sitemap (= 1.4.0)
jekyll-swiss (= 1.0.0)
jekyll-theme-architect (= 0.2.0)
jekyll-theme-cayman (= 0.2.0)
jekyll-theme-dinky (= 0.2.0)
jekyll-theme-hacker (= 0.2.0)
jekyll-theme-leap-day (= 0.2.0)
jekyll-theme-merlot (= 0.2.0)
jekyll-theme-midnight (= 0.2.0)
jekyll-theme-minimal (= 0.2.0)
jekyll-theme-modernist (= 0.2.0)
jekyll-theme-primer (= 0.6.0)
jekyll-theme-slate (= 0.2.0)
jekyll-theme-tactile (= 0.2.0)
jekyll-theme-time-machine (= 0.2.0)
jekyll-titles-from-headings (= 0.5.3)
jemoji (= 0.12.0)
kramdown (= 2.3.2)
kramdown-parser-gfm (= 1.1.0)
liquid (= 4.0.4)
mercenary (~> 0.3)
minima (= 2.5.1)
nokogiri (>= 1.13.6, < 2.0)
rouge (= 3.26.0)
terminal-table (~> 1.4)
github-pages-health-check (1.17.9)
addressable (~> 2.3)
dnsruby (~> 1.60)
octokit (~> 4.0)
public_suffix (>= 3.0, < 5.0)
typhoeus (~> 1.3)
html-pipeline (2.14.3)
activesupport (>= 2)
nokogiri (>= 1.4)
http_parser.rb (0.8.0)
i18n (1.14.1)
concurrent-ruby (~> 1.0)
jekyll (3.9.3)
addressable (~> 2.4)
colorator (~> 1.0)
em-websocket (~> 0.5)
i18n (>= 0.7, < 2)
jekyll-sass-converter (~> 1.0)
jekyll-watch (~> 2.0)
kramdown (>= 1.17, < 3)
liquid (~> 4.0)
mercenary (~> 0.3.3)
pathutil (~> 0.9)
rouge (>= 1.7, < 4)
safe_yaml (~> 1.0)
jekyll-avatar (0.7.0)
jekyll (>= 3.0, < 5.0)
jekyll-coffeescript (1.1.1)
coffee-script (~> 2.2)
coffee-script-source (~> 1.11.1)
jekyll-commonmark (1.4.0)
commonmarker (~> 0.22)
jekyll-commonmark-ghpages (0.4.0)
commonmarker (~> 0.23.7)
jekyll (~> 3.9.0)
jekyll-commonmark (~> 1.4.0)
rouge (>= 2.0, < 5.0)
jekyll-default-layout (0.1.4)
jekyll (~> 3.0)
jekyll-feed (0.15.1)
jekyll (>= 3.7, < 5.0)
jekyll-gist (1.5.0)
octokit (~> 4.2)
jekyll-github-metadata (2.13.0)
jekyll (>= 3.4, < 5.0)
octokit (~> 4.0, != 4.4.0)
jekyll-include-cache (0.2.1)
jekyll (>= 3.7, < 5.0)
jekyll-mentions (1.6.0)
html-pipeline (~> 2.3)
jekyll (>= 3.7, < 5.0)
jekyll-optional-front-matter (0.3.2)
jekyll (>= 3.0, < 5.0)
jekyll-paginate (1.1.0)
jekyll-readme-index (0.3.0)
jekyll (>= 3.0, < 5.0)
jekyll-redirect-from (0.16.0)
jekyll (>= 3.3, < 5.0)
jekyll-relative-links (0.6.1)
jekyll (>= 3.3, < 5.0)
jekyll-remote-theme (0.4.3)
addressable (~> 2.0)
jekyll (>= 3.5, < 5.0)
jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
rubyzip (>= 1.3.0, < 3.0)
jekyll-sass-converter (1.5.2)
sass (~> 3.4)
jekyll-seo-tag (2.8.0)
jekyll (>= 3.8, < 5.0)
jekyll-sitemap (1.4.0)
jekyll (>= 3.7, < 5.0)
jekyll-swiss (1.0.0)
jekyll-theme-architect (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-cayman (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-dinky (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-hacker (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-leap-day (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-merlot (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-midnight (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-minimal (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-modernist (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-primer (0.6.0)
jekyll (> 3.5, < 5.0)
jekyll-github-metadata (~> 2.9)
jekyll-seo-tag (~> 2.0)
jekyll-theme-slate (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-tactile (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-time-machine (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-titles-from-headings (0.5.3)
jekyll (>= 3.3, < 5.0)
jekyll-watch (2.2.1)
listen (~> 3.0)
jemoji (0.12.0)
gemoji (~> 3.0)
html-pipeline (~> 2.2)
jekyll (>= 3.0, < 5.0)
kramdown (2.3.2)
rexml
kramdown-parser-gfm (1.1.0)
kramdown (~> 2.0)
liquid (4.0.4)
listen (3.8.0)
rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10)
mercenary (0.3.6)
minima (2.5.1)
jekyll (>= 3.5, < 5.0)
jekyll-feed (~> 0.9)
jekyll-seo-tag (~> 2.1)
minitest (5.18.1)
nokogiri (1.15.2-arm64-darwin)
racc (~> 1.4)
octokit (4.25.1)
faraday (>= 1, < 3)
sawyer (~> 0.9)
pathutil (0.16.2)
forwardable-extended (~> 2.6)
public_suffix (4.0.7)
racc (1.7.1)
rb-fsevent (0.11.2)
rb-inotify (0.10.1)
ffi (~> 1.0)
rexml (3.2.5)
rouge (3.26.0)
ruby2_keywords (0.0.5)
rubyzip (2.3.2)
safe_yaml (1.0.5)
sass (3.7.4)
sass-listen (~> 4.0.0)
sass-listen (4.0.0)
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
sawyer (0.9.2)
addressable (>= 2.3.5)
faraday (>= 0.17.3, < 3)
simpleidn (0.2.1)
unf (~> 0.1.4)
terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1)
typhoeus (1.4.0)
ethon (>= 0.9.0)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
unf (0.1.4)
unf_ext
unf_ext (0.0.8.2)
unicode-display_width (1.8.0)
webrick (1.8.1)
PLATFORMS
arm64-darwin-22
DEPENDENCIES
github-pages (~> 228)
http_parser.rb (~> 0.6.0)
jekyll-feed (~> 0.12)
tzinfo (>= 1, < 3)
tzinfo-data
wdm (~> 0.1.1)
webrick (~> 1.8)
BUNDLED WITH
2.4.14

21
LICENSE
View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2023 avimallu
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,3 +0,0 @@
# What is this repo?
This is the source code for my personal website. You can visit it [here](https://avimallu.github.io/).

View File

@@ -1,56 +0,0 @@
# Welcome to Jekyll!
#
# This config file is meant for settings that affect your whole blog, values
# which you are expected to set up once and rarely edit after that. If you find
# yourself editing this file very often, consider using Jekyll's data files
# feature for the data you need to update frequently.
#
# For technical reasons, this file is *NOT* reloaded automatically when you use
# 'bundle exec jekyll serve'. If you change this file, please restart the server process.
#
# If you need help with YAML syntax, here are some quick references for you:
# https://learn-the-web.algonquindesign.ca/topics/markdown-yaml-cheat-sheet/#yaml
# https://learnxinyminutes.com/docs/yaml/
#
# Site settings
# These are used to personalize your new site. If you look in the HTML files,
# you will see them accessed via {{ site.title }}, {{ site.email }}, and so on.
# You can create any custom variable you would like, and they will be accessible
# in the templates via {{ site.myvariable }}.
title: Avinash's Blog
email: avimallu@avimallu
show_downloads: False
description: >- # this means to ignore newlines until "baseurl:"
Avinash's personal blog.
baseurl: "" # the subpath of your site, e.g. /blog
url: "" # the base hostname & protocol for your site, e.g. http://example.com
domain: avimallu.github.io
url: https://avimallu.github.io
linkedin_username: avinash-mallya
github_username: avimallu
# Build settings
remote_theme: pages-themes/hacker@v0.2.0
plugins:
- jekyll-remote-theme
# Exclude from processing.
# The following items will not be processed, by default.
# Any item listed under the `exclude:` key here will be automatically added to
# the internal "default list".
#
# Excluded items can be processed by explicitly listing the directories or
# their entries' file path in the `include:` list.
#
# exclude:
# - .sass-cache/
# - .jekyll-cache/
# - gemfiles/
# - Gemfile
# - Gemfile.lock
# - node_modules/
# - vendor/bundle/
# - vendor/cache/
# - vendor/gems/
# - vendor/ruby/

View File

@@ -1,12 +0,0 @@
<!-- start custom head snippets, customize with your own _includes/head-custom.html file -->
<!-- Setup theme-color -->
{% include head-custom-theme-colors.html %}
<!-- Setup Google Analytics -->
{% include head-custom-google-analytics.html %}
<!-- You can set your favicon here -->
<!-- link rel="shortcut icon" type="image/x-icon" href="{{ '/favicon.ico' | relative_url }}" -->
<!-- end custom head snippets -->

View File

@@ -1,12 +0,0 @@
---
title: About Me
permalink: /about_me
---
Hi there!
My name is Avinash Mallya (pronounced Uh-vin-ash Mul-yeah), and I'm a Data Scientist at [WISEcode](https://www.wisecode.ai/). This is my personal blog where I post about some creative ways that I've solved some complex problems in my career, usually with a solid amount of code to make sure that it's helpful for you.
In my free time, I'm involved in helping folks out at my favourite open-source package repositories, namely [Polars](https://github.com/pola-rs/polars/) and [`data.table`](https://github.com/Rdatatable/data.table). In fact - I've written [some parts](https://pola-rs.github.io/polars-book/user-guide/) of the Polars user-guide.
You can connect with me on [LinkedIn](https://www.linkedin.com/in/avinash-mallya), or [Github](https://github.com/avimallu). The source code for this website can be found at its [repo on my Github](https://github.com/avimallu/avimallu.github.io) as well. You'll also find source code in the form of text files, Jupyter Notebooks, or R Markdown files on my Github profile.

5
archetypes/default.md Normal file
View File

@@ -0,0 +1,5 @@
+++
date = '{{ .Date }}'
draft = true
title = '{{ replace .File.ContentBaseName "-" " " | title }}'
+++

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

187
assets/original.css Normal file
View File

@@ -0,0 +1,187 @@
code {
text-size-adjust: 100%;
-ms-text-size-adjust: 100%;
-moz-text-size-adjust: 100%;
-webkit-text-size-adjust: 100%;
}
body {
font-family: Verdana, sans-serif;
margin: auto;
padding: 20px;
max-width: 720px;
text-align: left;
background-color: #1d1f27;
word-wrap: break-word;
overflow-wrap: break-word;
line-height: 1.5;
color: #c9d1d9;
}
h1,
h2,
h3,
h4,
h5,
h6,
strong,
b {
color: #eee;
}
a {
color: #8cc2dd;
}
.title {
text-decoration: none;
border: 0;
}
.title h1 {
font-size: 24px;
margin: 19.92px 0 19.92px 0;
}
.title span {
font-weight: 400;
}
nav a {
margin-right: 10px;
}
textarea {
background-color: #252525;
color: #ddd;
width: 100%;
font-size: 16px;
}
input {
background-color: #252525;
color: #ddd;
font-size: 16px;
}
content {
line-height: 1.6;
}
table {
width: 100%;
}
table,
th,
td {
border: 1px solid;
border-collapse: collapse;
border-color: #c9d1d9;
padding: 5px;
}
img {
max-width: 100%;
height: auto;
}
code {
padding: 2px 5px;
color: #f8f8f2;
background-color: #282a36;
}
pre code {
display: block;
padding: 20px;
white-space: pre-wrap;
font-size: 14px;
overflow-x: auto;
text-wrap: nowrap;
}
blockquote {
border-left: 1px solid #999;
color: #ccc;
padding-left: 20px;
font-style: italic;
}
footer {
padding: 25px;
text-align: center;
}
.helptext {
color: #aaa;
font-size: small;
}
.errorlist {
color: #eba613;
font-size: small;
}
/* blog posts */
ul.blog-posts {
list-style-type: none;
padding: unset;
}
ul.blog-posts li {
display: flex;
margin-bottom: 10px;
}
ul.blog-posts li span {
flex: 0 0 130px;
}
ul.blog-posts li a:visited {
color: #8b6fcb;
}
a.blog-tags {
line-height: 2;
margin-right: 12px;
}
h3.blog-filter {
margin-bottom: 0;
}
.disabled {
color: currentColor;
cursor: not-allowed;
opacity: 0.7;
}
p.byline {
font-style: italic;
}
/* "Skip to main content" link */
.skip-link {
position: absolute;
top: 5;
transform: translateY(-600%);
transition: transform 0.5s;
background-color: #1d1f27;
padding: 6px;
}
.skip-link:focus {
transform: translateY(0%);
}
figure {
margin-inline-start: 0em;
margin-inline-end: 0em;
}
figcaption > p {
margin-block-start: 0px;
text-align: center;
font-style: italic;
color: #ccc;
}

View File

@@ -1 +0,0 @@
bundle exec jekyll serve

26
content/_index.md Normal file
View File

@@ -0,0 +1,26 @@
---
title: "about"
menu: "main"
weight: 1
---
# Hi there!
My name is Avinash Mallya (pronounced Uh-vin-aash Muh-ll-yeah), and I'm a data scientist by profession. This website is a creative outlet, and my piece of the internet where I show off.
# What's here?
You'll find the following:
* A few posts where I show some creative ways that I've solved complex problems.
* Links to projects that I've worked on, or have contributed to.
* An assortment of random things I've found interesting.
# Contact
You can find me on:
* [LinkedIn](https://www.linkedin.com/in/avinash-mallya)
* [Github](https://github.com/avimallu)
Please reach out via one of the above if you want to talk.

View File

@@ -0,0 +1,495 @@
+++
date = '2023-06-22'
draft = false
title = 'Overlap Joins: Number of docked trucks in an interval'
+++
# Premise
I stumbled upon an interesting [Stack Overflow question](https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period) that was linked [via an issue](https://github.com/pola-rs/polars/issues/9467) on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question, Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I'm more of a right-tool-for-the-job person, so I tried to find a better solution.
# Problem Statement
Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck's ID.
```py
import polars as pl # if you don't have polars, run
# pip install 'polars[all]'
data = pl.from_repr("""
┌─────────────────────┬─────────────────────┬─────┐
│ arrival_time ┆ departure_time ┆ ID │
│ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ datetime[μs] ┆ str │
╞═════════════════════╪═════════════════════╪═════╡
│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1 │
│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1 │
│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5 │
│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3 │
│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3 │
│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6 │
│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5 │
│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6 │
└─────────────────────┴─────────────────────┴─────┘
""")
```
We want to identify the number of trucks docked at any given time within a threshold of 1 minute *prior* to the arrival time of a truck, and 1 minute *after* the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.
# Finding a solution to the problem
## Evaluate for a specific row
Before we find a general solution to this problem, let's consider a specific row to understand the problem better:
```py
"""
┌─────────────────────┬─────────────────────┬─────┐
│ arrival_time ┆ departure_time ┆ ID │
│ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ datetime[μs] ┆ str │
╞═════════════════════╪═════════════════════╪═════╡
│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
└─────────────────────┴─────────────────────┴─────┘
"""
```
For this row, we need to find the number of trucks that are there between `2023-01-01 06:31:06` (1 minute prior to the `arrival_time`) and `2023-01-01 06:34:48` (1 minute after the `departure_time`). Manually going through the original dataset, we see that `B3`, `C3`, `A6` and `A5` are the truck IDs that qualify - they are all at the station at some point between `2023-01-01 06:31:06` and `2023-01-01 06:34:48`.
## Visually deriving an algorithm
There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap *window* relative to the arrival and departure times):
![The five different ways a period can overlap.](overlap_algorithm.png)
Take some time to absorb these cases - it's important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.
## Writing an SQL query based on the algorithm
In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It's often able to elegantly convey the logic that was used to execute the algorithm; and while it does come with excessive verbosity at times, it doesn't in this case.
Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).
### Introducing the DuckDB package
Once again, in theory, any SQL package or language can be used. Far too few however meet the ease-of-use that [DuckDB](https://duckdb.org/) provides:
1. no expensive set-up time (meaning no need for setting up databases, even temporary ones),
2. no dependencies (other than DuckDB itself, just `pip install duckdb`),
3. some very [friendly SQL extensions](https://duckdb.org/2022/05/04/friendlier-sql.html), and
4. ability to work directly on Polars and Pandas DataFrames without conversions
all with [mind-blowing speed](https://duckdblabs.github.io/db-benchmark/) that stands shoulder-to-shoulder with Polars. We'll also use a few advanced SQL concepts noted below.
#### Self-joins
This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.
#### A bullet train recap of non-equi joins
A key concept that we'll use is the idea of joining on a *range* of values rather than a specific value. That is, instead of the usual `LEFT JOIN ON A.column = B.column`, we can do `LEFT JOIN ON A.column <= B.column` for one row in table `A` to match to multiple rows in `B`. DuckDB has a [blog post](https://duckdb.org/2022/05/27/iejoin.html) that outlines this join in detail, including a fast implementation.
#### The concept of `LIST` columns
DuckDB has first class support for `LIST` columns - that is, each row in a `LIST` column can have a varying length (much like a Python `list`), but must have the exact same datatype (like R's `vector`). Using list columns allows us to eschew the use of an additional `GROUP BY` operation on top of a `WHERE` filter or `SELECT DISTINCT` operation, since we can directly perform those on the `LIST` column itself.
#### Date algebra
Dates can be rather difficult to handle well in most tools and languages, with several packages purpose built to make handling them easier - [lubridate](https://lubridate.tidyverse.org/) from the [tidyverse](https://www.tidyverse.org/) is a stellar example. Thankfully, DuckDB provides a similar Swiss-army-knife set of tools to deal with them, including specifying `INTERVAL`s (a special data type that represents a period of time independent of specific time values) to modify `TIMESTAMP` values using addition or subtraction.
### Tell me the query, PLEASE!
Okay - that was a lot of background. Let's have at it! The query by itself in SQL is (see immediately below for runnable code in Python):
```sql
SELECT
A.arrival_time
,A.departure_time
,A.window_open
,A.window_close
,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
FROM (
SELECT *
,arrival_time - (INTERVAL 1 MINUTE) AS window_open
,departure_time + (INTERVAL 1 MINUTE) AS window_close
FROM data) A
LEFT JOIN (
SELECT *
,DATEDIFF('seconds', arrival_time, departure_time) AS duration
FROM data) B
ON ((B.arrival_time <= A.window_open AND
(B.arrival_time + TO_SECONDS(B.duration)) >= A.window_open) OR
(B.arrival_time >= A.window_open AND
B.departure_time <= A.window_close) OR
(B.arrival_time >= A.window_open AND
(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close))
GROUP BY 1, 2, 3, 4
```
A small, succinct query such as this will need a bit of explanation to take it all in. Here's one below, reproducible in Python (make sure to install `duckdb` first!). Expand it to view.
<details markdown="1"><summary>SQL with explanation.</summary>
```py
import duckdb as db
db.query("""
SELECT
A.arrival_time
,A.departure_time
,A.window_open
,A.window_close
-- LIST aggregates the values into a LIST column
-- and LIST_DISTINCT finds the unique values in it
,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
-- finally, LIST_UNIQUE calculates the unique number of values in it
,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
FROM (
SELECT
*
,arrival_time - (INTERVAL 1 MINUTE) AS window_open
,departure_time + (INTERVAL 1 MINUTE) AS window_close
FROM data -- remember we defined data as the Polars DataFrame with our truck station data
) A
LEFT JOIN (
SELECT
*
-- This is the time, in seconds between the arrival and departure of
-- each truck PER ROW in the original data-frame
,DATEDIFF('seconds', arrival_time, departure_time) AS duration
FROM data -- this is where we perform a self-join
) B
ON (
-- Case 2 in the diagram;
(B.arrival_time <= A.window_open AND
-- Adding the duration here makes sure that the second interval
-- is at least ENDING AFTER the start of the overlap window
(B.arrival_time + TO_SECONDS(B.duration)) >= A.window_open) OR
-- Case 3 in the diagram - the simplest of all five cases
(B.arrival_time >= A.window_open AND
B.departure_time <= A.window_close) OR
-- Case 4 in the diagram;
(B.arrival_time >= A.window_open AND
-- Subtracting the duration here makes sure that the second interval
-- STARTS BEFORE the end of the overlap window.
(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)
)
GROUP BY 1, 2, 3, 4
""")
```
</details>
The output of this query is:
```
"""
┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
│ arrival_time │ departure_time │ window_open │ … │ docked_trucks │ docked_truck_count │
│ timestamp │ timestamp │ timestamp │ │ varchar[] │ uint64 │
├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1] │ 1 │
│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1] │ 1 │
│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │ 4 │
│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3] │ 3 │
│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3] │ 3 │
├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
│ 9 rows 6 columns (5 shown) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
"""
```
We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also find how DuckDB is able to seamlessly integrate with an existing Pandas or Polars pipeline with zero-conversion costs. In fact, we can convert this back to a Polars or Pandas dataframe by appending `.pl()` or `.df()` to the closing bracket, i.e. `db.query(...).pl()` and `db.query(...).df()` respectively.
## Can we make the SQL simpler?
Now that we've understood the logic that goes into the query, let's try to optimize the algorithm. We have the three conditions:
```sql
-- Case 2 in the diagram
(B.arrival_time <= A.window_open AND
(B.arrival_time + TO_SECONDS(B.duration)) >= A.window_open) OR
-- Case 3 in the diagram
(B.arrival_time >= A.window_open AND
B.departure_time <= A.window_close) OR
-- Case 4 in the diagram
(B.arrival_time >= A.window_open AND
(B.departure_time - TO_SECONDS(B.duration)) <= A.window_close)
```
What is common between these three conditions? It takes a while to see it; but it becomes clear that all these cases require the start of the overlap to be *before* the window ends, and the end of the overlap to be *after* the window starts. This can be simplified to just:
```sql
B.arrival_time <= A.window_close AND
B.departure_time >= A.window_open
```
making our query much simpler!
### Simplified SQL: Part 1
We've removed the need for the `duration` calculation altogether now. Therefore, we can write:
```sql
SELECT
A.arrival_time
,A.departure_time
,A.window_open
,A.window_close
,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
FROM (
SELECT *
,arrival_time - (INTERVAL 1 MINUTE) AS window_open
,departure_time + (INTERVAL 1 MINUTE) AS window_close
FROM data) A
LEFT JOIN data B
ON (
B.arrival_time <= A.window_close AND
B.departure_time >= A.window_open
)
GROUP BY 1, 2, 3, 4
```
Can we simplify this even further?
### Simplification: Part 2
I think the SQL query in the above section is very easy to read already. However, it is a little clunky overall, and there is a way that we can leverage DuckDB's extensive optimizations to improve **legibility** by rewriting the query as a cross join:
```sql
SELECT
A.arrival_time
,A.departure_time
,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
FROM data A, data B
WHERE B.arrival_time <= window_close
AND B.departure_time >= window_open
GROUP BY 1, 2, 3, 4
```
Why does this work? Before optimization on DuckDB, this is what the query plan looks like:
<details markdown="1"><summary>DuckDB query plan before optimization</summary>
```py
"""
┌───────────────────────────┐
│ PROJECTION │
│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
│ 0 │
│ 1 │
│ 2 │
│ 3 │
│ docked_trucks │
│ docked_truck_count │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ AGGREGATE │
│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
│ arrival_time │
│ departure_time │
│ window_open │
│ window_close │
│ list(ID) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ FILTER │
│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
│ (arrival_time <= │
│(departure_time + to_m... │
│ AS BIGINT)))) │
│ (departure_time >= │
│(arrival_time - to_min... │
│ AS BIGINT)))) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ CROSS_PRODUCT ├──────────────┐
└─────────────┬─────────────┘ │
┌─────────────┴─────────────┐┌─────────────┴─────────────┐
│ ARROW_SCAN ││ ARROW_SCAN │
└───────────────────────────┘└───────────────────────────┘
"""
```
</details>
After optimization, the `CROSS_PRODUCT` is **automatically** optimized to an **interval join**!
<details markdown="1"><summary>DuckDB query plan after optimization</summary>
```py
"""
┌───────────────────────────┐
│ PROJECTION │
│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
│ 0 │
│ 1 │
│ 2 │
│ 3 │
│ docked_trucks │
│ docked_truck_count │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ AGGREGATE │
│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
│ arrival_time │
│ departure_time │
│ window_open │
│ window_close │
│ list(ID) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ COMPARISON_JOIN │
│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
│ INNER │
│ ((departure_time + '00:01 │
│ :00'::INTERVAL) >= ├──────────────┐
│ arrival_time) │ │
│((arrival_time - '00:01:00'│ │
│ ::INTERVAL) <= │ │
│ departure_time) │ │
└─────────────┬─────────────┘ │
┌─────────────┴─────────────┐┌─────────────┴─────────────┐
│ ARROW_SCAN ││ ARROW_SCAN │
└───────────────────────────┘└───────────────────────────┘
"""
```
</details>
So in effect, we're actually exploiting a feature of DuckDB to allow us to write our queries in a suboptimal manner for greater readability, and allowing the optimizer to do a good chunk of our work for us. I wouldn't recommend using this generally, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.
### How to get query plans?
I'm glad you asked. Here's the DuckDB [page explaining `EXPLAIN`](https://duckdb.org/docs/guides/meta/explain.html) (heh). Here's the code I used:
```py
import duckdb as db
db.sql("SET EXPLAIN_OUTPUT='all';")
print(db.query("""
EXPLAIN
SELECT
A.arrival_time
,A.departure_time
,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
FROM data A, data B
WHERE B.arrival_time <= window_close
AND B.departure_time >= window_open
GROUP BY 1, 2, 3, 4
""").pl()[1, 1])
```
# What are the alternatives?
## The `data.table` way
[`data.table`](https://github.com/Rdatatable/data.table) is a package that has historically been ahead of its time - in both speed and features that it has had. Development has taken a hit recently, but will likely [pick back up](https://github.com/Rdatatable/data.table/issues/5656). It's my favourite package on all fronts for data manipulation, but suffers simply from the lack of broader R support across the ML and DL space.
### The `foverlaps` function
If this kind of overlapping join is common, shouldn't someone have developed a package for it? Turns out, `data.table` has, and with very specific constraints that make it the perfect solution to our problem (if you don't mind switching over to R, that is).
The `foverlaps` function has these requirements:
1. The input `data.table` objects have to be keyed for automatic recognition of columns.
2. The default match type is that it matches all three cases from the image above. Side note: it also has matches for `within` overlap, matching `start` and `end` windows.
3. The last two matching columns in the join condition in `by` must specify the `start` and `end` points of the overlapping window. This isn't a problem for us now, but does restrict for future uses where we may want non-equi joins on other cases.
### The code, _si_, the code!
Without further ado:
```r
library(data.table)
library(lubridate)
######### BOILERPLATE CODE, NO LOGIC HERE ####################
arrival_time = as_datetime(c(
'2023-01-01 06:23:47.000000', '2023-01-01 06:26:42.000000',
'2023-01-01 06:30:20.000000', '2023-01-01 06:32:06.000000',
'2023-01-01 06:33:09.000000', '2023-01-01 06:34:08.000000',
'2023-01-01 06:36:40.000000', '2023-01-01 06:37:43.000000',
'2023-01-01 06:39:48.000000'))
departure_time = as_datetime(c(
'2023-01-01 06:25:08.000000', '2023-01-01 06:28:02.000000',
'2023-01-01 06:35:01.000000', '2023-01-01 06:33:48.000000',
'2023-01-01 06:36:01.000000', '2023-01-01 06:39:49.000000',
'2023-01-01 06:38:34.000000', '2023-01-01 06:40:48.000000',
'2023-01-01 06:46:10.000000'))
ID = c('A1', 'A1', 'A5', 'A6', 'B3', 'C3', 'A6', 'A5', 'A6')
DT = data.table(
arrival_time = arrival_time,
departure_time = departure_time,
ID = ID)
######### BOILERPLATE CODE, NO LOGIC HERE ####################
# A copy(DT) creates a copy of a data.table that isn't linked
# to the original one, so that changes in it don't reflect in
# the original DT object.
# The `:=` allows assignment by reference (i.e. "in place").
DT_with_windows = copy(DT)[, `:=`(
window_start = arrival_time - minutes(1),
window_end = departure_time + minutes(1))]
# This step is necessary for the second table, but not the first, but we
# key both data.tables to make the foverlap code very succinct.
setkeyv(DT, c("arrival_time", "departure_time"))
setkeyv(DT_with_windows, c("window_start", "window_end"))
# The foverlaps function returns a data.table, so we can simply apply
# the usual data.table syntax on it!
# Since we have the same name of some columns in both data.tables,
# the latter table's columns are prefixed with "i." to avoid conflicts.
foverlaps(DT, DT_with_windows)[
, .(docked_trucks = list(unique(i.ID)),
docked_truck_count = uniqueN(i.ID))
, .(arrival_time, departure_time)]
```
provides us the output:
```r
arrival_time departure_time docked_trucks docked_truck_count
<POSc> <POSc> <list> <int>
1: 2023-01-01 06:23:47 2023-01-01 06:25:08 A1 1
2: 2023-01-01 06:26:42 2023-01-01 06:28:02 A1 1
3: 2023-01-01 06:30:20 2023-01-01 06:35:01 A5,A6,B3,C3 4
4: 2023-01-01 06:32:06 2023-01-01 06:33:48 A5,A6,B3,C3 4
5: 2023-01-01 06:33:09 2023-01-01 06:36:01 A5,A6,B3,C3 4
6: 2023-01-01 06:34:08 2023-01-01 06:39:49 A5,A6,B3,C3 4
7: 2023-01-01 06:36:40 2023-01-01 06:38:34 B3,C3,A6,A5 4
8: 2023-01-01 06:37:43 2023-01-01 06:40:48 C3,A6,A5 3
9: 2023-01-01 06:39:48 2023-01-01 06:46:10 C3,A5,A6 3
```
### Considerations for using `data.table`
The package offers a wonderful, nearly one-stop solution that doesn't require you to write the logic out for the query or command yourself, but has a major problem for a lot of users - it requires you to switch your codebase to R, and a lot of your tasks may be on Python or in an SQL pipeline. So, what do you do?
Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you'll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.

View File

Before

Width:  |  Height:  |  Size: 137 KiB

After

Width:  |  Height:  |  Size: 137 KiB

View File

Before

Width:  |  Height:  |  Size: 226 KiB

After

Width:  |  Height:  |  Size: 226 KiB

View File

Before

Width:  |  Height:  |  Size: 296 KiB

After

Width:  |  Height:  |  Size: 296 KiB

View File

@@ -1,7 +1,7 @@
---
title: Finding representative samples efficiently for large datasets
permalink: /RepresentativeSample
author: Avinash Mallya
date: 2023-10-19
tags: [representative, samples, faiss, approximate, nearest, neighbor, network, graph, networkx, polars, category]
---
@@ -234,11 +234,11 @@ The next step in the process is to create a network graph using the edge-list. B
Remember that we have identified the (k=5) nearest neighbors of **each** data point. Let's say that we have a point A that has a nearest neighbor B. C is **not** a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are sufficiently similar to B within a particular *minimum threshold*, then A will be connected to C through B! Hopefully a small visual below would help.
![How a network component is formed.](./assets/003_representative_samples/001_Network_Cluster_1.png)
![How a network component is formed.](001_Network_Cluster_1.png)
What happens when such a concept is extended for many data points? Not all of them would be connected - because we're applying a *minimum* threshold that they have to meet. This is the only heuristic part of the rather fast process. Here's one more helpful visual:
![How a network cluster is formed.](./assets/003_representative_samples/002_Network_Cluster_2.png)
![How a network cluster is formed.](002_Network_Cluster_2.png)
Very starry night-esque vibes here. Let's get to the code.

View File

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

View File

Before

Width:  |  Height:  |  Size: 52 KiB

After

Width:  |  Height:  |  Size: 52 KiB

View File

Before

Width:  |  Height:  |  Size: 21 KiB

After

Width:  |  Height:  |  Size: 21 KiB

View File

Before

Width:  |  Height:  |  Size: 22 KiB

After

Width:  |  Height:  |  Size: 22 KiB

View File

Before

Width:  |  Height:  |  Size: 247 KiB

After

Width:  |  Height:  |  Size: 247 KiB

View File

Before

Width:  |  Height:  |  Size: 210 KiB

After

Width:  |  Height:  |  Size: 210 KiB

View File

Before

Width:  |  Height:  |  Size: 26 KiB

After

Width:  |  Height:  |  Size: 26 KiB

View File

Before

Width:  |  Height:  |  Size: 311 KiB

After

Width:  |  Height:  |  Size: 311 KiB

View File

Before

Width:  |  Height:  |  Size: 294 KiB

After

Width:  |  Height:  |  Size: 294 KiB

View File

@@ -1,7 +1,7 @@
---
title: Quick hacks to make client-ready presentations
permalink: /PowerPointSnap
author: Avinash Mallya
date: 2023-10-20
tags: [powerpoint, ppt, vba]
---
@@ -40,7 +40,7 @@ Here's a non-exhaustive list of all the options available.
This is the part of the interface that can be used for shapes (which include charts and tables).
![The UI for copying *shape* properties](./assets/002_power_point_snap/01_Shapes.png)
![The UI for copying *shape* properties](01_Shapes.png)
To use, first select a *shape* object, click on "Set". Then, choose the object you want to *Snap* its properties to (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.
@@ -50,7 +50,7 @@ Note that it's probably not to apply a property of a shape to a table - if you w
Charts are also supported, with dedicated features for it.
![The UI for copying *chart* properties](./assets/002_power_point_snap/02_Charts.png)
![The UI for copying *chart* properties](02_Charts.png)
What do these features do? You should be able to hover over the option and get a tooltip that shows what it's capable of, but here's another summary just in case:
@@ -67,7 +67,7 @@ Your immediate senior in a consulting environment would frown at your chart, and
It's **never** a one time affair. But don't worry, we have this nice feature to help us. If you click on the *Customize Label* option, you will get this (without the "Set" option):
![The UI for customizing labels.](./assets/002_power_point_snap/DataLabelsScreenshot.JPG)
![The UI for customizing labels.](DataLabelsScreenshot.JPG)
Never mind the rather unfriendly legend entries. They're just here to demonstrate that you can do the following kinds of whacky abilities with your own chart!
@@ -75,7 +75,7 @@ Never mind the rather unfriendly legend entries. They're just here to demonstrat
Of course, visuals will do it more justice. For example, look at this image:
![There's a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren't centered.](./assets/002_power_point_snap/Revenue_Presentation_1.png)
![There's a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren't centered.](Revenue_Presentation_1.png)
Here's what you can do:
@@ -95,7 +95,7 @@ Here's what you can do:
This is what your results should look like:
![Everything almost consistent. Your senior rests their eyes, and secretly wonder how you managed to do it quickly... maybe they should change some requirements...](./assets/002_power_point_snap/Revenue_Presentation_2.png)
![Everything almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it quickly... maybe they should change some requirements...](Revenue_Presentation_2.png)
Of course, getting those calculations right is a whole different thing that will need some work.
@@ -103,11 +103,11 @@ Of course, getting those calculations right is a whole different thing that will
Oftentimes, you have two tables that show similar values... you know the drill. Here's what you can do in a scenario such as this:
![Similar data, but vastly different tables.](./assets/002_power_point_snap/Table_Presentation_1.png)
![Similar data, but vastly different tables.](Table_Presentation_1.png)
This is what the *Tables* section of the tool looks like:
![The UI for *Tables*](./assets/002_power_point_snap/03_Tables.png)
![The UI for *Tables*](03_Tables.png)
To align these tables together,
@@ -119,7 +119,7 @@ To align these tables together,
Here's what you'll end up with:
![Similar data, and similar enough tables.](./assets/002_power_point_snap/Table_Presentation_2.png)
![Similar data, and similar enough tables.](Table_Presentation_2.png)
Pretty neat, eh?

5
content/blog/_index.md Normal file
View File

@@ -0,0 +1,5 @@
---
title: "blog"
menu: "main"
weight: 2
---

18
content/projects.md Normal file
View File

@@ -0,0 +1,18 @@
---
title: "projects"
menu: "main"
weight: 3
---
Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.
# Featured projects
1. [BorrowChecker](https://avimallu.github.io/BorrowChecker/): A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. [Repository link](https://github.com/avimallu/BorrowChecker).
2. [PowerPointSnap](https://github.com/avimallu/PowerPointSnap): A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying [blog post]({{< ref "blog/003_powerpointsnap">}}).
# Other work or contributions
1. [IntelligentReceiptSplitter](https://github.com/avimallu/IntelligentReceiptSplitter): A relatively simple predecessor to [BorrowChecker](https://avimallu.github.io/BorrowChecker/) that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run.
2. [r.data.table.funs](https://github.com/avimallu/r.data.table.funs): A very small set of R functions that use `data.table`, that I found very useful earlier in my career to quickly churn out analyses. It is not ground-breaking, but rather something that anybody with sufficient basic skills in R can understand, and save an immense amount of time.
3. I [wrote](https://github.com/pola-rs/polars-book/pull/364) [several](https://github.com/pola-rs/polars-book/pull/358) [chapters](https://github.com/pola-rs/polars-book/pull/365/files) of the Polars Book, which have since been moved to the main Polars repository. Polars was a breath of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like `data.table` and `dplyr` dominated), so I was eager to make it better for everybody making the switch.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

81
hugo.toml Normal file
View File

@@ -0,0 +1,81 @@
baseURL = "https://avimallu.dev/"
theme = "hugo-bearcub"
copyright = "© Avinash Mallya"
defaultContentLanguage = "en"
# Generate a nice robots.txt for SEO
enableRobotsTXT = true
# Setup syntax highlighting without inline styles. For more information about
# why you'd want to avoid inline styles, see
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Security-Policy/style-src#unsafe_inline_styles
[markup]
[markup.highlight]
lineNos = true
lineNumbersInTable = false
# This allows Bear Cub to use a variation of Dracula that is more accessible
# to people with poor eyesight. For more information about color contrast
# and accessibility, see https://web.dev/color-and-contrast-accessibility/
noClasses = false
[markup.goldmark]
[markup.goldmark.renderer]
unsafe = true
# Multilingual mode config. For more information about how to set up translation,
# see https://gohugo.io/content-management/multilingual/
[languages]
[languages.en]
title = "Avinash's Blog"
languageName = "en-US 🇺🇸"
LanguageCode = "en-US"
contentDir = "content"
[languages.en.params]
madeWith = "Design via [Bear Cub](https://github.com/clente/hugo-bearcub)."
[params]
# The description of your website
# description = ""
# These images will show up when services want to generate a preview of a link
# to your site. Ignored if `generateSocialCard = true`. For more information
# about previews, see https://gohugo.io/templates/internal#twitter-cards and
# https://gohugo.io/templates/internal#open-graph
images = ["static/favicon.ico"]
# This title is used as the site_name on the Hugo's internal opengraph
# structured data template
title = "Avinash's Blog"
# Dates are displayed following the format below. For more information about
# formatting, see https://gohugo.io/functions/format/
dateFormat = "2006-01-02"
# If your blog is multilingual but you haven't translated a page, this theme
# will create a disabled link. By setting `hideUntranslated` to true, you can
# have the theme simply not show any link
hideUntranslated = false
# (EXPERIMENTAL) This theme has two options for its CSS styles: "original" and
# "herman". The former is what you see on Bear Cub's demo (an optimized
# version of Hugo Bear Blog), while the latter has a more modern look based on
# Herman Martinus's version of the Blogster Minimal theme for Astro.
themeStyle = "original"
# (EXPERIMENTAL) This theme is capable of dynamically generating social cards
# for posts that don't have `images` defined in their front matter; By setting
# `generateSocialCard` to false, you can prevent this behavior. For more
# information see layouts/partials/social_card.html
generateSocialCard = false
# Social media. Delete any item you aren't using to make sure it won't show up
# in your website's metadata.
[params.social]
# twitter = "example" # Twitter handle (without '@')
# facebook_admin = "0000000000" # Facebook Page Admin ID
# Author metadata. This is mostly used for the RSS feed of your site, but the
# email is also added to the footer of each post. You can hide the "reply to"
# link by using a `hideReply` param in front matter.
[params.author]
# name = "Avinash Mallya" # Your name as shown in the RSS feed metadata
# email = "nah@example.com" # Added to the footer so readers can reply to posts

View File

@@ -1,18 +0,0 @@
---
# Feel free to add content and custom Front Matter to this file.
# To modify the layout, see https://jekyllrb.com/docs/themes/#overriding-theme-defaults
---
# Hey there!
From LinkedIn or Github? Then you probably want to know a little bit [about me](./about.md).
Bookmarked this page for some of my informative posts? See a list of them below.
# Interesting Problems
[*Fast Overlap Joins* to find the number of trucks at a station during time intervals](./_posts/2023-06-22-overlap_joins.md).
[Quick PowerPoint hacks to make client-ready presentations](./_posts/2023-10-20-PowerPointSnap.md).
[Finding representative samples efficiently for large datasets](./_posts/2023-10-19-Finding_Rep_Samples.md).

22
layouts/partials/nav.html Normal file
View File

@@ -0,0 +1,22 @@
{{ range .Site.Menus.main.ByWeight }}
<a href="{{ .URL }}">{{ .Name }}</a>
{{ end }}
<a href='{{ absURL "index.xml" }}'>rss</a>
<!-- Convert this page's translations into a dict -->
{{ $translations := dict }}
{{ range .Translations }}
{{ $translations = merge $translations (dict .Language.Lang .) }}
{{ end }}
<!-- Create a link to every translation -->
{{ range where .Site.Languages "Lang" "!=" .Page.Lang }}
{{ with (index $translations .Lang) }}
<a href="{{ .RelPermalink }}">{{ .Language.LanguageName }}</a>
{{ else }}
<!-- The complicated setup was necessary to make a grayed out link -->
{{ if not .Params.hideUntranslated }}
<a class="disabled" role="link" aria-disabled="true">{{ .LanguageName }}</a>
{{ end }}
{{ end }}
{{ end }}

5
public/404.html Normal file
View File

@@ -0,0 +1,5 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>404</title><meta name=title content="404 Page not found"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/404.html"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="404 Page not found"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="404 Page not found"><meta itemprop=name content="404 Page not found"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><h1>404</h1><h2>ʕノ•ᴥ•ʔノ ︵ ┻━┻</h2></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,569 @@
<!DOCTYPE html>
<html lang="en-US">
<head><script src="/livereload.js?mindelay=10&amp;v=2&amp;port=1313&amp;path=livereload" data-no-instant defer></script>
<meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Overlap Joins: Number of docker trucks in an interval | Avinash&#39;s Blog</title>
<meta name="title" content="Overlap Joins: Number of docker trucks in an interval" />
<meta name="description" content="Premise
I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement
Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID." />
<meta name="author" content="" />
<meta name="keywords" content="" />
<meta property="og:url" content="http://localhost:1313/blog/001_overlap_joins/001_overlap_joins/">
<meta property="og:site_name" content="Avinash&#39;s Blog">
<meta property="og:title" content="Overlap Joins: Number of docker trucks in an interval">
<meta property="og:description" content="Premise I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
Im more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the trucks ID.">
<meta property="og:locale" content="en_US">
<meta property="og:type" content="article">
<meta property="article:section" content="blog">
<meta property="article:published_time" content="2023-06-22T00:00:00+00:00">
<meta property="article:modified_time" content="2023-06-22T00:00:00+00:00">
<meta property="og:image" content="http://localhost:1313/static/favicon.ico">
<meta property="fb:admins" content="0000000000">
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:image" content="http://localhost:1313/static/favicon.ico">
<meta name="twitter:title" content="Overlap Joins: Number of docker trucks in an interval">
<meta name="twitter:description" content="Premise I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
Im more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the trucks ID.">
<meta name="twitter:site" content="@example">
<meta itemprop="name" content="Overlap Joins: Number of docker trucks in an interval">
<meta itemprop="description" content="Premise I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
Im more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the trucks ID.">
<meta itemprop="datePublished" content="2023-06-22T00:00:00+00:00">
<meta itemprop="dateModified" content="2023-06-22T00:00:00+00:00">
<meta itemprop="wordCount" content="3078">
<meta itemprop="image" content="http://localhost:1313/static/favicon.ico">
<meta name="referrer" content="no-referrer-when-downgrade" />
<link href="/original.min.css" rel="stylesheet">
<link href="/syntax.min.css" rel="stylesheet">
</head>
<body>
<header><a class="skip-link" href="#main-content">Skip to main content</a>
<a href="/" class="title"><h1>Avinash&#39;s Blog</h1></a>
<nav>
<a href="/">Home</a>
<a href="/blog/">Blog</a>
<a href='http://localhost:1313/index.xml'>RSS</a>
</nav>
</header>
<main id="main-content">
<h1>Overlap Joins: Number of docker trucks in an interval</h1>
<p class="byline">
<time datetime='2023-06-22' pubdate>
2023-06-22
</time>
</p>
<content>
<h1 id="premise">Premise</h1>
<p>I stumbled upon an interesting <a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stackoverflow question</a> that was linked <a href="https://github.com/pola-rs/polars/issues/9467">via an issue</a> on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.</p>
<p>I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.</p>
<h1 id="problem-statement">Problem Statement</h1>
<p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span> <span class="c1"># if you don&#39;t have polars, run </span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="c1"># pip install &#39;polars[all]&#39;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">from_repr</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1 │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1 │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5 │
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3 │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3 │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6 │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5 │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6 │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div><p>We want to identify the number of trucks docked at any given time within a threshold of 1 minute <em>prior</em> to the arrival time of a truck, and 1 minute <em>after</em> the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.</p>
<h1 id="finding-a-solution-to-the-problem">Finding a solution to the problem</h1>
<h2 id="evaluate-for-a-specific-row">Evaluate for a specific row</h2>
<p>Before we find a general solution to this problem, let&rsquo;s consider a specific row to understand the problem better:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span></span></span></code></pre></div><p>For this row, we need to find the number of trucks that are there between <code>2023-01-01 06:31:06</code> (1 minute prior to the <code>arrival_time</code> and <code>2023-01-01 06:34:48</code> (1 minute post the <code>departure_time</code>). Manually going through the original dataset, we see that <code>B3</code>, <code>C3</code>, <code>A6</code> and <code>A5</code> are the truck IDs that qualify - they all are at the station in a duration that is between <code>2023-01-01 06:31:06</code> and <code>2023-01-01 06:34:48</code>.</p>
<h2 id="visually-deriving-an-algorithm">Visually deriving an algorithm</h2>
<p>There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap <em>window</em> relative to the arrival and departure times):</p>
<p><img src="overlap_algorithm.png" alt="The five different ways a period can overlap."></p>
<p>Take some time to absorb these cases - it&rsquo;s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.</p>
<h2 id="writing-an-sql-query-based-on-the-algorithm">Writing an SQL query based on the algorithm</h2>
<p>In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It&rsquo;s often able to convey elegantly the logic that was used to execute the algorithm; and while it does come with excessive verbosity at times, it doesn&rsquo;t quite in this case.</p>
<p>Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).</p>
<h3 id="introducing-the-duckdb-package">Introducing the DuckDB package</h3>
<p>Once again, in theory, any SQL package or language can be used. Far too few however meet the ease-of-use that <a href="https://duckdb.org/">DuckDB</a> provides:</p>
<ol>
<li>no expensive set-up time (meaning no need for setting up databases, even temporary ones),</li>
<li>no dependencies (other than DuckDB itself, just <code>pip install duckdb</code>),</li>
<li>some very <a href="https://duckdb.org/2022/05/04/friendlier-sql.html">friendly SQL extensions</a>, and</li>
<li>ability to work directly on Polars and Pandas DataFrames without conversions</li>
</ol>
<p>all with <a href="https://duckdblabs.github.io/db-benchmark/">mind-blowing speed</a> that stands shoulder-to-shoulder with Polars. We&rsquo;ll also use a few advanced SQL concepts noted below.</p>
<h4 id="self-joins">Self-joins</h4>
<p>This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.</p>
<h4 id="a-bullet-train-recap-of-non-equi-joins">A bullet train recap of non-equi joins</h4>
<p>A key concept that we&rsquo;ll use is the idea of joining on a <em>range</em> of values rather than a specific value. That is, instead of the usual <code>LEFT JOIN ON A.column = B.column</code>, we can do <code>LEFT JOIN ON A.column &lt;= B.column</code> for one row in table <code>A</code> to match to multiple rows in <code>B</code>. DuckDB has a <a href="https://duckdb.org/2022/05/27/iejoin.html">blog post</a> that outlines this join in detail, including a fast implementation.</p>
<h4 id="the-concept-of-list-columns">The concept of <code>LIST</code> columns</h4>
<p>DuckDB has first class support for <code>LIST</code> columns - that is, each row in a <code>LIST</code> column can have a varying length (much like a Python <code>list</code>), but must have the exact same datatype (like R&rsquo;s <code>vector</code>). Using list columns allows us to eschew the use of an additional <code>GROUP BY</code> operation on top of a <code>WHERE</code> filter or <code>SELECT DISTINCT</code> operation, since we can directly perform those on the <code>LIST</code> column itself.</p>
<h4 id="date-algebra">Date algebra</h4>
<p>Dates can be rather difficult to handle well in most tools and languages, with several packages purpose built to make handling them easier - <a href="https://lubridate.tidyverse.org/">lubridate</a> from the <a href="https://www.tidyverse.org/">tidyverse</a> is a stellar example. Thankfully, DuckDB provides a similar swiss-knife set of tools to deal with them, including specifying <code>INTERVAL</code>s (a special data type that represents a period of time independent of specific time values) to modify <code>TIMESTAMP</code> values using addition or subtraction.</p>
<h3 id="tell-me-the-query-please">Tell me the query, PLEASE!</h3>
<p>Okay - that was a lot of background. Let&rsquo;s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">DATEDIFF</span><span class="p">(</span><span class="s1">&#39;seconds&#39;</span><span class="p">,</span><span class="w"> </span><span class="n">arrival_time</span><span class="p">,</span><span class="w"> </span><span class="n">departure_time</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">duration</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">((</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">))</span><span class="w">
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>A small, succinct query such as this will need a bit of explanation to take it all in. Here&rsquo;s one below, reproducible in Python (make sure to install <code>duckdb</code> first!). Expand it to view.</p>
<details markdown="1"><summary>SQL with explanation.</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> ,A.window_open
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.window_close
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> -- LIST aggregates the values into a LIST column
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> -- and LIST_DISTINCT finds the unique values in it
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> -- finally, LIST_UNIQUE calculates the unique number of values in it
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2"> FROM (
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2"> ,arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2"> ,departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2"> FROM data -- remember we defined data as the Polars DataFrame with our truck station data
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2"> ) A
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2"> LEFT JOIN (
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2"> -- This is the time, in seconds between the arrival and departure of
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2"> -- each truck PER ROW in the original data-frame
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2"> ,DATEDIFF(&#39;seconds&#39;, arrival_time, departure_time) AS duration
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2"> FROM data -- this is where we perform a self-join
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2"> ) B
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2"> ON (
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2"> -- Case 2 in the diagram;
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2"> (B.arrival_time &lt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2"> -- Adding the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2"> -- is at least ENDING AFTER the start of the overlap window
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2"> (B.arrival_time + TO_SECONDS(B.duration)) &gt;= A.window_open) OR
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="s2"> -- Case 3 in the diagram - the simplest of all five cases
</span></span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="s2"> B.departure_time &lt;= A.window_close) OR
</span></span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="s2"> -- Case 4 in the diagram;
</span></span></span><span class="line"><span class="ln">43</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">44</span><span class="cl"><span class="s2"> -- Subtracting the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="s2"> -- STARTS BEFORE the end of the overlap window.
</span></span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="s2"> (B.departure_time - TO_SECONDS(B.duration)) &lt;= A.window_close)
</span></span></span><span class="line"><span class="ln">47</span><span class="cl"><span class="s2"> )
</span></span></span><span class="line"><span class="ln">48</span><span class="cl"><span class="s2"> GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">49</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div></details>
<p>The output of this query is:</p>
<pre tabindex="0"><code>&#34;&#34;&#34;
┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
│ arrival_time │ departure_time │ window_open │ … │ docked_trucks │ docked_truck_count │
│ timestamp │ timestamp │ timestamp │ │ varchar[] │ uint64 │
├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1] │ 1 │
│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1] │ 1 │
│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │ 4 │
│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3] │ 3 │
│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3] │ 3 │
├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
│ 9 rows 6 columns (5 shown) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
&#34;&#34;&#34;</code></pre><p>We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also find how DuckDB is able to seamlessly integrate with an existing Pandas or Polars pipeline with zero-conversion costs. In fact, we can convert the result back to a Polars or Pandas dataframe by appending <code>.pl()</code> or <code>.df()</code> after the closing bracket, i.e. <code>db.query(...).pl()</code> and <code>db.query(...).df()</code> respectively.</p>
<h2 id="can-we-make-the-sql-simpler">Can we make the SQL simpler?</h2>
<p>Now that we&rsquo;ve understood the logic that goes into the query, let&rsquo;s try to optimize the algorithm. We have the three conditions:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="c1">-- Case 2 in the diagram
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="w"></span><span class="c1">-- Case 3 in the diagram
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="w"></span><span class="c1">-- Case 4 in the diagram
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span></span></span></code></pre></div><p>What is common between these three conditions? It takes a while to see it; but it becomes clear that all these cases require the start of the overlap to be <em>before</em> the window ends, and the end of the overlap to be <em>after</em> the window starts. This can be simplified to just:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="w"></span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span></span></span></code></pre></div><p>making our query much simpler!</p>
<h3 id="simplified-sql-part-1">Simplified SQL: Part 1</h3>
<p>We&rsquo;ve removed the need for the <code>duration</code> calculation altogether now. Therefore, we can write:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="p">)</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Can we simplify this even further?</p>
<h3 id="simplification-part-2">Simplification: Part 2</h3>
<p>I think the SQL query in the above section is very easy to read already. However, it is a little clunky overall, and there is a way that we can leverage DuckDB&rsquo;s extensive optimizations to improve <strong>legibility</strong> by rewriting the query as a cross join:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">A</span><span class="p">,</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">WHERE</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"></span><span class="k">AND</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Why does this work? Before optimization on DuckDB, this is what the query plan looks like:</p>
<details markdown="1"><summary>DuckDB query plan before optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ FILTER │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ (arrival_time &lt;= │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│(departure_time + to_m... │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ (departure_time &gt;= │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│(arrival_time - to_min... │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">│ CROSS_PRODUCT ├──────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>After optimization, the <code>CROSS_PRODUCT</code> is <strong>automatically</strong> optimized to an <strong>interval join</strong>!</p>
<details markdown="1"><summary>DuckDB query plan after optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ COMPARISON_JOIN │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ INNER │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│ ((departure_time + &#39;00:01 │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ :00&#39;::INTERVAL) &gt;= ├──────────────┐
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ arrival_time) │ │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│((arrival_time - &#39;00:01:00&#39;│ │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ ::INTERVAL) &lt;= │ │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">│ departure_time) │ │
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>So in effect, we&rsquo;re actually exploiting a feature of DuckDB to allow us to write our queries in a suboptimal manner for greater readability, and allowing the optimizer to do a good chunk of our work for us. I wouldn&rsquo;t recommend using this generally, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.</p>
<h3 id="how-to-get-query-plans">How to get query plans?</h3>
<p>I&rsquo;m glad you asked. Here&rsquo;s the DuckDB <a href="https://duckdb.org/docs/guides/meta/explain.html">page explaining <code>EXPLAIN</code></a> (heh). Here&rsquo;s the code I used:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&#34;SET EXPLAIN_OUTPUT=&#39;all&#39;;&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="nb">print</span><span class="p">(</span><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">EXPLAIN
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">SELECT
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">FROM data A, data B
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">WHERE B.arrival_time &lt;= window_close
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">AND B.departure_time &gt;= window_open
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">pl</span><span class="p">()[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span></span></span></code></pre></div><h1 id="what-are-the-alternatives">What are the alternatives?</h1>
<h2 id="the-datatable-way">The <code>data.table</code> way</h2>
<p><a href="https://github.com/Rdatatable/data.table"><code>data.table</code></a> is a package that has historically been ahead of its time - in both speed and features that it has had. Development has taken a hit recently, but will likely <a href="https://github.com/Rdatatable/data.table/issues/5656">pick back up</a>. It&rsquo;s my favourite package on all fronts for data manipulation, but suffers simply from the lack of broader R support across the ML and DL space.</p>
<h3 id="the-foverlaps-function">The <code>foverlaps</code> function</h3>
<p>If this kind of overlapping join is common, shouldn&rsquo;t someone have developed a package for it? Turns out, <code>data.table</code> has, and with very specific constraints that make it the perfect solution to our problem (if you don&rsquo;t mind switching over to R, that is).</p>
<p>The <code>foverlaps</code> function has these requirements:</p>
<ol>
<li>The input <code>data.table</code> objects have to be keyed for automatic recognition of columns.</li>
<li>The default match type is that it matches all three cases from the image above. Side note: it also has match types for <code>within</code> overlap and for matching <code>start</code> and <code>end</code> windows.</li>
<li>The last two matching columns in the join condition in <code>by</code> must specify the <code>start</code> and <code>end</code> points of the overlapping window. This isn&rsquo;t a problem for us now, but it does restrict future uses where we may want non-equi joins in other cases.</li>
</ol>
<h3 id="the-code-_si_-the-code">The code, <em>si</em>, the code!</h3>
<p>Without further ado:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">data.table</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">lubridate</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl">
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="n">arrival_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s">&#39;2023-01-01 06:23:47.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:26:42.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s">&#39;2023-01-01 06:30:20.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:32:06.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s">&#39;2023-01-01 06:33:09.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:34:08.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:40.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:37:43.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s">&#39;2023-01-01 06:39:48.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="n">departure_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s">&#39;2023-01-01 06:25:08.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:28:02.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s">&#39;2023-01-01 06:35:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:33:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:39:49.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s">&#39;2023-01-01 06:38:34.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:40:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s">&#39;2023-01-01 06:46:10.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="n">ID</span> <span class="o">=</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;B3&#39;</span><span class="p">,</span> <span class="s">&#39;C3&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl">
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">DT</span> <span class="o">=</span> <span class="nf">data.table</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">arrival_time</span> <span class="o">=</span> <span class="n">arrival_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">departure_time</span> <span class="o">=</span> <span class="n">departure_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="n">ID</span> <span class="o">=</span> <span class="n">ID</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl">
</span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="c1"># A copy(DT) creates a copy of a data.table that isn&#39;t linked</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="c1"># to the original one, so that changes in it don&#39;t reflect in</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="c1"># the original DT object.</span>
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># The `:=` allow assignment by reference (i.e. &#34;in place&#34;).</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="n">DT_with_windows</span> <span class="o">=</span> <span class="nf">copy</span><span class="p">(</span><span class="n">DT</span><span class="p">)</span><span class="n">[</span><span class="p">,</span> <span class="nf">`:=`</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">window_start</span> <span class="o">=</span> <span class="n">arrival_time</span> <span class="o">-</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">window_end</span> <span class="o">=</span> <span class="n">departure_time</span> <span class="o">+</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">))</span><span class="n">]</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl">
</span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="c1"># This step is necessary for the second table, but not the first, but we</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="c1"># key both data.tables to make the foverlap code very succinct.</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;arrival_time&#34;</span><span class="p">,</span> <span class="s">&#34;departure_time&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT_with_windows</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;window_start&#34;</span><span class="p">,</span> <span class="s">&#34;window_end&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">37</span><span class="cl">
</span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="c1"># The foverlap function returns a data.table, so we can simply apply</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="c1"># the usual data.table syntax on it!</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="c1"># Since we have the same name of some columns in both data.tables,</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="c1"># the latter table&#39;s columns are prefixed with &#34;i.&#34; to avoid conflicts.</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="nf">foverlaps</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="n">DT_with_windows</span><span class="p">)</span><span class="n">[</span>
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="p">,</span> <span class="n">.(docked_trucks</span> <span class="o">=</span> <span class="nf">list</span><span class="p">(</span><span class="nf">unique</span><span class="p">(</span><span class="n">i.ID</span><span class="p">)),</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl"> <span class="n">docked_truck_count</span> <span class="o">=</span> <span class="nf">uniqueN</span><span class="p">(</span><span class="n">i.ID</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">45</span><span class="cl"> <span class="p">,</span> <span class="n">.(arrival_time</span><span class="p">,</span> <span class="n">departure_time</span><span class="p">)</span><span class="n">]</span></span></span></code></pre></div><p>provides us the output:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"> <span class="n">arrival_time</span> <span class="n">departure_time</span> <span class="n">docked_trucks</span> <span class="n">docked_truck_count</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">list</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">int</span><span class="o">&gt;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="m">1</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">23</span><span class="o">:</span><span class="m">47</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">25</span><span class="o">:</span><span class="m">08</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="m">2</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">26</span><span class="o">:</span><span class="m">42</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">28</span><span class="o">:</span><span class="m">02</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="m">3</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">30</span><span class="o">:</span><span class="m">20</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">35</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="m">4</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">32</span><span class="o">:</span><span class="m">06</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">48</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="m">5</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">09</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="m">6</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">34</span><span class="o">:</span><span class="m">08</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">49</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="m">7</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">40</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">38</span><span class="o">:</span><span class="m">34</span> <span class="n">B3</span><span class="p">,</span><span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="m">8</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">37</span><span class="o">:</span><span class="m">43</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">40</span><span class="o">:</span><span class="m">48</span> <span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">3</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="m">9</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">48</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">46</span><span class="o">:</span><span class="m">10</span> <span class="n">C3</span><span class="p">,</span><span class="n">A5</span><span class="p">,</span><span class="n">A6</span> <span class="m">3</span></span></span></code></pre></div><h3 id="considerations-for-using-datatable">Considerations for using <code>data.table</code></h3>
<p>The package offers a wonderful, nearly one-stop solution that doesn&rsquo;t require you to write the logic out for the query or command yourself, but has a major problem for a lot of users - it requires you to switch your codebase to R, and a lot of your tasks may be on Python or in an SQL pipeline. So, what do you do?</p>
<p>Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you&rsquo;ll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.</p>
</content>
<p>
</p>
<p>
<a href='mailto:nah@example.com?subject=Reply%20to%20"Overlap%20Joins%3a%20Number%20of%20docker%20trucks%20in%20an%20interval"'>
Reply to this post by email ↪
</a>
</p>
</main>
<footer><small>
© Avinash Mallya | Design via <a href="https://github.com/clente/hugo-bearcub">Bear Cub</a>.
</small></footer>
</body>
</html>

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 137 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 226 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 296 KiB

View File

@@ -0,0 +1,243 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Finding representative samples efficiently for large datasets | Avinash's Blog</title><meta name=title content="Finding representative samples efficiently for large datasets"><meta name=description content="Premise
In this day and age, we&rsquo;re not short on data. Good data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.
Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:
You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.
You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.
You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.
In a hurry?
Here&rsquo;s what you need to do:"><meta name=author content="Avinash Mallya"><meta name=keywords content="representative,samples,faiss,approximate,nearest,neighbor,network,graph,networkx,polars,category,"><meta property="og:url" content="https://avimallu.dev/blog/002_representative_samples/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Finding representative samples efficiently for large datasets"><meta property="og:description" content="Premise In this day and age, were not short on data. Good data, on the other hand, is very valuable. When youve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.
Let's formalize the problem a little so that a proper approach can be developed. Here's the problem statement:
You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix. You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples. In a hurry? Heres what you need to do:"><meta property="og:locale" content="en_US"><meta property="og:type" content="article"><meta property="article:section" content="blog"><meta property="article:published_time" content="2023-10-19T00:00:00+00:00"><meta property="article:modified_time" content="2023-10-19T00:00:00+00:00"><meta property="article:tag" content="Representative"><meta property="article:tag" content="Samples"><meta property="article:tag" content="Faiss"><meta property="article:tag" content="Approximate"><meta property="article:tag" content="Nearest"><meta property="article:tag" content="Neighbor"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Finding representative samples efficiently for large datasets"><meta name=twitter:description content="Premise In this day and age, were not short on data. Good data, on the other hand, is very valuable. When youve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.
Let's formalize the problem a little so that a proper approach can be developed. Here's the problem statement:
You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix. You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples. In a hurry? Heres what you need to do:"><meta itemprop=name content="Finding representative samples efficiently for large datasets"><meta itemprop=description content="Premise In this day and age, were not short on data. Good data, on the other hand, is very valuable. When youve got a large amount of improperly labelled data, it may become hard to find to find a representative dataset to train a model on such that it generalizes well.
Let's formalize the problem a little so that a proper approach can be developed. Here's the problem statement:
You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix. You need to train a model to classify these data points on either these labels, or on labels dervied from imperfect labels. You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples. In a hurry? Heres what you need to do:"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=wordCount content="3202"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta itemprop=keywords content="Representative,Samples,Faiss,Approximate,Nearest,Neighbor,Network,Graph,Networkx,Polars,Category"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link href=/syntax.min.css rel=stylesheet></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><h1>Finding representative samples efficiently for large datasets</h1><p class=byline><time datetime=2023-10-19 pubdate>2023-10-19
</time>· Avinash Mallya</p><content><h1 id=premise>Premise</h1><p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p><p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p><ol><li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li><li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li><li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li></ol><h2 id=in-a-hurry>In a hurry?</h2><p>Here&rsquo;s what you need to do:</p><ol><li>Read the premise and see if it fits your problem.</li><li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li></ol><h2 id=why-do-we-need-representative-samples>Why do we need representative samples?</h2><p>Generally, three things come to mind:</p><ol><li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li><li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li><li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li></ol><h1 id=define-the-data>Define the data</h1><p>This data can be practically anything that can be represented as a 2D matrix.</p><p>There are exceptions. 
Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p><h2 id=get-a-specific-dataset>Get a specific dataset</h2><p>For this specific article, I will use the <a href=https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data>ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p><blockquote><p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>>></code> indicating the command used, and the output to be without those prefixes.</p></blockquote><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln> 1</span><span class=cl><span class=o>&gt;&gt;</span> <span class=kn>import</span> <span class=nn>polars</span> <span class=k>as</span> <span class=nn>pl</span>
</span></span><span class=line><span class=ln> 2</span><span class=cl><span class=o>&gt;&gt;</span> <span class=n>data</span> <span class=o>=</span> <span class=n>pl</span><span class=o>.</span><span class=n>read_csv</span><span class=p>(</span><span class=s2>&#34;archive/shopmania.csv&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln> 3</span><span class=cl><span class=o>&gt;&gt;</span> <span class=n>data</span>
</span></span><span class=line><span class=ln> 4</span><span class=cl><span class=n>shape</span><span class=p>:</span> <span class=p>(</span><span class=mi>313_705</span><span class=p>,</span> <span class=mi>4</span><span class=p>)</span>
</span></span><span class=line><span class=ln> 5</span><span class=cl><span class=err>┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class=line><span class=ln> 6</span><span class=cl><span class=err></span> <span class=n>product_ID</span> <span class=err></span> <span class=n>product_title</span> <span class=err></span> <span class=n>category_ID</span> <span class=err></span> <span class=n>category_label</span> <span class=err></span>
</span></span><span class=line><span class=ln> 7</span><span class=cl><span class=err></span> <span class=o>---</span> <span class=err></span> <span class=o>---</span> <span class=err></span> <span class=o>---</span> <span class=err></span> <span class=o>---</span> <span class=err></span>
</span></span><span class=line><span class=ln> 8</span><span class=cl><span class=err></span> <span class=n>i64</span> <span class=err></span> <span class=nb>str</span> <span class=err></span> <span class=n>i64</span> <span class=err></span> <span class=nb>str</span> <span class=err></span>
</span></span><span class=line><span class=ln> 9</span><span class=cl><span class=err>╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class=line><span class=ln>10</span><span class=cl><span class=err></span> <span class=mi>2</span> <span class=err></span> <span class=n>twilight</span> <span class=n>central</span> <span class=n>park</span> <span class=nb>print</span> <span class=err></span> <span class=mi>2</span> <span class=err></span> <span class=n>Collectibles</span> <span class=err></span>
</span></span><span class=line><span class=ln>11</span><span class=cl><span class=err></span> <span class=mi>3</span> <span class=err></span> <span class=n>fox</span> <span class=nb>print</span> <span class=err></span> <span class=mi>2</span> <span class=err></span> <span class=n>Collectibles</span> <span class=err></span>
</span></span><span class=line><span class=ln>12</span><span class=cl><span class=err></span> <span class=mi>4</span> <span class=err></span> <span class=n>circulo</span> <span class=n>de</span> <span class=n>papel</span> <span class=n>wall</span> <span class=n>art</span> <span class=err></span> <span class=mi>2</span> <span class=err></span> <span class=n>Collectibles</span> <span class=err></span>
</span></span><span class=line><span class=ln>13</span><span class=cl><span class=err></span> <span class=mi>5</span> <span class=err></span> <span class=n>hidden</span> <span class=n>path</span> <span class=nb>print</span> <span class=err></span> <span class=mi>2</span> <span class=err></span> <span class=n>Collectibles</span> <span class=err></span>
</span></span><span class=line><span class=ln>14</span><span class=cl><span class=err></span> <span class=err></span> <span class=err></span> <span class=err></span> <span class=err></span> <span class=err></span> <span class=err></span> <span class=err></span> <span class=err></span>
</span></span><span class=line><span class=ln>15</span><span class=cl><span class=err></span> <span class=mi>313703</span> <span class=err></span> <span class=n>deago</span> <span class=n>anti</span> <span class=n>fog</span> <span class=n>swimming</span> <span class=n>diving</span> <span class=n>full</span> <span class=n>face</span> <span class=n>mask</span> <span class=err></span> <span class=mi>229</span> <span class=err></span> <span class=n>Water</span> <span class=n>Sports</span> <span class=err></span>
</span></span><span class=line><span class=ln>16</span><span class=cl><span class=err></span> <span class=err></span> <span class=n>surface</span> <span class=n>snorkel</span> <span class=n>scuba</span> <span class=n>fr</span> <span class=n>gopro</span> <span class=n>black</span> <span class=n>s</span><span class=o>/</span><span class=n>m</span> <span class=err></span> <span class=err></span> <span class=err></span>
</span></span><span class=line><span class=ln>17</span><span class=cl><span class=err></span> <span class=mi>313704</span> <span class=err></span> <span class=n>etc</span> <span class=n>buys</span> <span class=n>full</span> <span class=n>face</span> <span class=n>gopro</span> <span class=n>compatible</span> <span class=n>snorkel</span> <span class=n>scuba</span> <span class=err></span> <span class=mi>229</span> <span class=err></span> <span class=n>Water</span> <span class=n>Sports</span> <span class=err></span>
</span></span><span class=line><span class=ln>18</span><span class=cl><span class=err></span> <span class=err></span> <span class=n>diving</span> <span class=n>mask</span> <span class=n>blue</span> <span class=n>large</span><span class=o>/</span><span class=n>xtralarge</span> <span class=n>blue</span> <span class=err></span> <span class=err></span> <span class=err></span>
</span></span><span class=line><span class=ln>19</span><span class=cl><span class=err></span> <span class=mi>313705</span> <span class=err></span> <span class=n>men</span> <span class=mi>039</span> <span class=n>s</span> <span class=n>full</span> <span class=n>face</span> <span class=n>breathe</span> <span class=n>free</span> <span class=n>diving</span> <span class=n>snorkel</span> <span class=n>mask</span> <span class=err></span> <span class=mi>229</span> <span class=err></span> <span class=n>Water</span> <span class=n>Sports</span> <span class=err></span>
</span></span><span class=line><span class=ln>20</span><span class=cl><span class=err></span> <span class=err></span> <span class=n>scuba</span> <span class=n>optional</span> <span class=n>hd</span> <span class=n>camera</span> <span class=n>blue</span> <span class=n>mask</span> <span class=n>only</span> <span class=n>adult</span> <span class=n>men</span> <span class=err></span> <span class=err></span> <span class=err></span>
</span></span><span class=line><span class=ln>21</span><span class=cl><span class=err></span> <span class=mi>313706</span> <span class=err></span> <span class=n>women</span> <span class=mi>039</span> <span class=n>s</span> <span class=n>full</span> <span class=n>face</span> <span class=n>breathe</span> <span class=n>free</span> <span class=n>diving</span> <span class=n>snorkel</span> <span class=err></span> <span class=mi>229</span> <span class=err></span> <span class=n>Water</span> <span class=n>Sports</span> <span class=err></span>
</span></span><span class=line><span class=ln>22</span><span class=cl><span class=err></span> <span class=err></span> <span class=n>mask</span> <span class=n>scuba</span> <span class=n>optional</span> <span class=n>hd</span> <span class=n>camera</span> <span class=n>black</span> <span class=n>mask</span> <span class=n>only</span> <span class=err></span> <span class=err></span> <span class=err></span>
</span></span><span class=line><span class=ln>23</span><span class=cl><span class=err></span> <span class=err></span> <span class=n>children</span> <span class=ow>and</span> <span class=n>women</span> <span class=err></span> <span class=err></span> <span class=err></span>
</span></span><span class=line><span class=ln>24</span><span class=cl><span class=err>└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p><blockquote><p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p></blockquote><p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln>1</span><span class=cl><span class=n>data</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=ln>2</span><span class=cl> <span class=n>data</span>
</span></span><span class=line><span class=ln>3</span><span class=cl> <span class=o>.</span><span class=n>filter</span><span class=p>(</span><span class=n>pl</span><span class=o>.</span><span class=n>count</span><span class=p>()</span><span class=o>.</span><span class=n>over</span><span class=p>(</span><span class=s2>&#34;category_ID&#34;</span><span class=p>)</span> <span class=o>==</span> <span class=mi>10000</span><span class=p>)</span>
</span></span><span class=line><span class=ln>4</span><span class=cl><span class=p>)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln> 1</span><span class=cl><span class=o>&gt;&gt;&gt;</span> <span class=n>data</span><span class=o>.</span><span class=n>get_column</span><span class=p>(</span><span class=s2>&#34;category_label&#34;</span><span class=p>)</span><span class=o>.</span><span class=n>unique</span><span class=p>()</span>
</span></span><span class=line><span class=ln> 2</span><span class=cl><span class=n>shape</span><span class=p>:</span> <span class=p>(</span><span class=mi>17</span><span class=p>,)</span>
</span></span><span class=line><span class=ln> 3</span><span class=cl><span class=n>Series</span><span class=p>:</span> <span class=s1>&#39;category_label&#39;</span> <span class=p>[</span><span class=nb>str</span><span class=p>]</span>
</span></span><span class=line><span class=ln> 4</span><span class=cl><span class=p>[</span>
</span></span><span class=line><span class=ln> 5</span><span class=cl> <span class=s2>&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class=line><span class=ln> 6</span><span class=cl> <span class=s2>&#34;Scarves and wraps&#34;</span>
</span></span><span class=line><span class=ln> 7</span><span class=cl> <span class=s2>&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class=line><span class=ln> 8</span><span class=cl> <span class=s2>&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class=line><span class=ln> 9</span><span class=cl> <span class=s2>&#34;Cell Phones Accessories&#34;</span>
</span></span><span class=line><span class=ln>10</span><span class=cl> <span class=s2>&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class=line><span class=ln>11</span><span class=cl> <span class=s2>&#34;Jewelry&#34;</span>
</span></span><span class=line><span class=ln>12</span><span class=cl> <span class=s2>&#34;Belts&#34;</span>
</span></span><span class=line><span class=ln>13</span><span class=cl> <span class=s2>&#34;Men Lingerie&#34;</span>
</span></span><span class=line><span class=ln>14</span><span class=cl> <span class=s2>&#34;Crafts&#34;</span>
</span></span><span class=line><span class=ln>15</span><span class=cl> <span class=s2>&#34;Football&#34;</span>
</span></span><span class=line><span class=ln>16</span><span class=cl> <span class=s2>&#34;Medical Supplies&#34;</span>
</span></span><span class=line><span class=ln>17</span><span class=cl> <span class=s2>&#34;Adult&#34;</span>
</span></span><span class=line><span class=ln>18</span><span class=cl> <span class=s2>&#34;Hunting&#34;</span>
</span></span><span class=line><span class=ln>19</span><span class=cl> <span class=s2>&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class=line><span class=ln>20</span><span class=cl> <span class=s2>&#34;Pet Supply&#34;</span>
</span></span><span class=line><span class=ln>21</span><span class=cl> <span class=s2>&#34;Office Supplies&#34;</span>
</span></span><span class=line><span class=ln>22</span><span class=cl><span class=p>]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p><h2 id=specify-the-task>Specify the task</h2><p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p><blockquote><p>Craft a <em>small</em> representative sample for each category.</p></blockquote><p>Why small? It helps that it&rsquo;ll make the model faster to train - <em>and</em> keep the training data manageable in size.</p><h1 id=finding-representative-samples>Finding representative samples</h1><p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p><h2 id=getting-sentencetransformer-embeddings>Getting <code>SentenceTransformer</code> embeddings</h2><p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href=https://www.sbert.net/docs/installation.html>please check their website</a>.</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln>1</span><span class=cl><span class=kn>import</span> <span class=nn>sentence_transformers</span>
</span></span><span class=line><span class=ln>2</span><span class=cl><span class=c1># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class=line><span class=ln>3</span><span class=cl><span class=n>ST</span> <span class=o>=</span> <span class=n>sentence_transformers</span><span class=o>.</span><span class=n>SentenceTransformer</span><span class=p>(</span><span class=s2>&#34;all-mpnet-base-v2&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>4</span><span class=cl><span class=n>title_embeddings</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=ln>5</span><span class=cl> <span class=n>ST</span><span class=o>.</span><span class=n>encode</span><span class=p>(</span>
</span></span><span class=line><span class=ln>6</span><span class=cl> <span class=n>data</span><span class=o>.</span><span class=n>get_column</span><span class=p>(</span><span class=s2>&#34;product_title&#34;</span><span class=p>)</span><span class=o>.</span><span class=n>to_list</span><span class=p>(),</span>
</span></span><span class=line><span class=ln>7</span><span class=cl> <span class=n>show_progress_bar</span><span class=o>=</span><span class=kc>True</span><span class=p>,</span> <span class=n>convert_to_tensor</span><span class=o>=</span><span class=kc>True</span><span class=p>)</span>
</span></span><span class=line><span class=ln>8</span><span class=cl> <span class=o>.</span><span class=n>numpy</span><span class=p>())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p><blockquote><p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p></blockquote><h2 id=the-concept-of-_approximate_-nearest-neighbors>The concept of <em>approximate</em> nearest neighbors</h2><p>Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this was the need to calculate all, or nearly all distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p><p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. 
You can use any algorithm - a full list of the major ones is <a href=https://github.com/erikbern/ann-benchmarks>available here</a>.</p><p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p><h3 id=building-the-database>Building the database</h3><p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln>1</span><span class=cl><span class=kn>import</span> <span class=nn>faiss</span>
</span></span><span class=line><span class=ln>2</span><span class=cl><span class=k>def</span> <span class=nf>create_index</span><span class=p>(</span><span class=n>title_embeddings</span><span class=p>):</span>
</span></span><span class=line><span class=ln>3</span><span class=cl> <span class=n>d</span> <span class=o>=</span> <span class=n>title_embeddings</span><span class=o>.</span><span class=n>shape</span><span class=p>[</span><span class=mi>1</span><span class=p>]</span> <span class=c1># Number of dimensions</span>
</span></span><span class=line><span class=ln>4</span><span class=cl> <span class=n>ann_index</span> <span class=o>=</span> <span class=n>faiss</span><span class=o>.</span><span class=n>IndexFlatL2</span><span class=p>(</span><span class=n>d</span><span class=p>)</span> <span class=c1># Index using Euclidean (L2) distance</span>
</span></span><span class=line><span class=ln>5</span><span class=cl> <span class=n>ann_index</span><span class=o>.</span><span class=n>add</span><span class=p>(</span><span class=n>title_embeddings</span><span class=p>)</span> <span class=c1># Build the index</span>
</span></span><span class=line><span class=ln>6</span><span class=cl>
</span></span><span class=line><span class=ln>7</span><span class=cl> <span class=k>return</span> <span class=n>ann_index</span> <span class=c1># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that selects only the data for a particular category, and then creates the database from it. We&rsquo;ll need three pieces of information from this function:</p><ol><li>The actual <code>faiss</code> database.</li><li>The actual subset of data that was used to build this index.</li><li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li></ol><p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln> 1</span><span class=cl><span class=kn>import</span> <span class=nn>faiss</span>
</span></span><span class=line><span class=ln> 2</span><span class=cl><span class=kn>import</span> <span class=nn>numpy</span> <span class=k>as</span> <span class=nn>np</span>
</span></span><span class=line><span class=ln> 3</span><span class=cl><span class=kn>import</span> <span class=nn>polars</span> <span class=k>as</span> <span class=nn>pl</span>
</span></span><span class=line><span class=ln> 4</span><span class=cl>
</span></span><span class=line><span class=ln> 5</span><span class=cl><span class=k>def</span> <span class=nf>create_index</span><span class=p>(</span><span class=n>label</span><span class=p>):</span>
</span></span><span class=line><span class=ln> 6</span><span class=cl> <span class=n>faiss_indices</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=ln> 7</span><span class=cl> <span class=n>data</span> <span class=c1># this needs to be an argument if you want to create a generic function</span>
</span></span><span class=line><span class=ln> 8</span><span class=cl> <span class=o>.</span><span class=n>with_row_count</span><span class=p>(</span><span class=s2>&#34;row_idx&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln> 9</span><span class=cl> <span class=o>.</span><span class=n>filter</span><span class=p>(</span><span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;category_label&#34;</span><span class=p>)</span> <span class=o>==</span> <span class=n>label</span><span class=p>)</span>
</span></span><span class=line><span class=ln>10</span><span class=cl> <span class=o>.</span><span class=n>get_column</span><span class=p>(</span><span class=s2>&#34;row_idx&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>11</span><span class=cl> <span class=o>.</span><span class=n>to_list</span><span class=p>()</span>
</span></span><span class=line><span class=ln>12</span><span class=cl> <span class=p>)</span>
</span></span><span class=line><span class=ln>13</span><span class=cl>
</span></span><span class=line><span class=ln>14</span><span class=cl> <span class=n>faiss_data</span> <span class=o>=</span> <span class=n>title_embeddings</span><span class=p>[</span><span class=n>faiss_indices</span><span class=p>]</span>
</span></span><span class=line><span class=ln>15</span><span class=cl> <span class=n>d</span> <span class=o>=</span> <span class=n>faiss_data</span><span class=o>.</span><span class=n>shape</span><span class=p>[</span><span class=mi>1</span><span class=p>]</span> <span class=c1># Number of dimensions</span>
</span></span><span class=line><span class=ln>16</span><span class=cl> <span class=n>faiss_DB</span> <span class=o>=</span> <span class=n>faiss</span><span class=o>.</span><span class=n>IndexFlatIP</span><span class=p>(</span><span class=n>d</span><span class=p>)</span> <span class=c1># Index using Inner Product</span>
</span></span><span class=line><span class=ln>17</span><span class=cl> <span class=n>faiss</span><span class=o>.</span><span class=n>normalize_L2</span><span class=p>(</span><span class=n>faiss_data</span><span class=p>)</span> <span class=c1># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class=line><span class=ln>18</span><span class=cl> <span class=c1># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between -1 and 1.</span>
</span></span><span class=line><span class=ln>19</span><span class=cl> <span class=c1># If using Euclidean or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class=line><span class=ln>20</span><span class=cl> <span class=c1># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class=line><span class=ln>21</span><span class=cl> <span class=n>faiss_DB</span><span class=o>.</span><span class=n>add</span><span class=p>(</span><span class=n>faiss_data</span><span class=p>)</span> <span class=c1># Build the index</span>
</span></span><span class=line><span class=ln>22</span><span class=cl>
</span></span><span class=line><span class=ln>23</span><span class=cl> <span class=k>return</span> <span class=n>faiss_DB</span><span class=p>,</span> <span class=n>faiss_data</span><span class=p>,</span> <span class=n>faiss_indices</span></span></span></code></pre></div><h3 id=identifying-the-nearest-neighbors>Identifying the nearest neighbors</h3><p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em>, i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln> 1</span><span class=cl><span class=k>def</span> <span class=nf>get_edge_list</span><span class=p>(</span><span class=n>label</span><span class=p>,</span> <span class=n>k</span><span class=o>=</span><span class=mi>5</span><span class=p>):</span>
</span></span><span class=line><span class=ln> 2</span><span class=cl> <span class=n>faiss_DB</span><span class=p>,</span> <span class=n>faiss_data</span><span class=p>,</span> <span class=n>faiss_indices</span> <span class=o>=</span> <span class=n>create_index</span><span class=p>(</span><span class=n>label</span><span class=p>)</span>
</span></span><span class=line><span class=ln> 3</span><span class=cl> <span class=c1># To map positions in the FAISS index back to `row_idx` in the original `data`</span>
</span></span><span class=line><span class=ln> 4</span><span class=cl> <span class=n>faiss_indices_map</span> <span class=o>=</span> <span class=p>{</span><span class=n>i</span><span class=p>:</span> <span class=n>x</span> <span class=k>for</span> <span class=n>i</span><span class=p>,</span><span class=n>x</span> <span class=ow>in</span> <span class=nb>enumerate</span><span class=p>(</span><span class=n>faiss_indices</span><span class=p>)}</span>
</span></span><span class=line><span class=ln> 5</span><span class=cl> <span class=c1># To map the indices back to the original strings</span>
</span></span><span class=line><span class=ln> 6</span><span class=cl> <span class=n>title_name_map</span> <span class=o>=</span> <span class=p>{</span><span class=n>i</span><span class=p>:</span> <span class=n>x</span> <span class=k>for</span> <span class=n>i</span><span class=p>,</span><span class=n>x</span> <span class=ow>in</span> <span class=n>data</span><span class=o>.</span><span class=n>select</span><span class=p>(</span><span class=s2>&#34;row_idx&#34;</span><span class=p>,</span> <span class=s2>&#34;product_title&#34;</span><span class=p>)</span><span class=o>.</span><span class=n>rows</span><span class=p>()}</span>
</span></span><span class=line><span class=ln> 7</span><span class=cl> <span class=n>distances</span><span class=p>,</span> <span class=n>neighbors</span> <span class=o>=</span> <span class=n>faiss_DB</span><span class=o>.</span><span class=n>search</span><span class=p>(</span><span class=n>faiss_data</span><span class=p>,</span> <span class=n>k</span><span class=p>)</span>
</span></span><span class=line><span class=ln> 8</span><span class=cl>
</span></span><span class=line><span class=ln> 9</span><span class=cl> <span class=k>return</span> <span class=p>(</span>
</span></span><span class=line><span class=ln>10</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>DataFrame</span><span class=p>({</span>
</span></span><span class=line><span class=ln>11</span><span class=cl> <span class=s2>&#34;from&#34;</span><span class=p>:</span> <span class=n>faiss_indices</span><span class=p>})</span>
</span></span><span class=line><span class=ln>12</span><span class=cl> <span class=o>.</span><span class=n>with_columns</span><span class=p>(</span>
</span></span><span class=line><span class=ln>13</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>Series</span><span class=p>(</span><span class=s2>&#34;to&#34;</span><span class=p>,</span> <span class=n>neighbors</span><span class=p>),</span>
</span></span><span class=line><span class=ln>14</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>Series</span><span class=p>(</span><span class=s2>&#34;distance&#34;</span><span class=p>,</span> <span class=n>distances</span><span class=p>))</span>
</span></span><span class=line><span class=ln>15</span><span class=cl> <span class=o>.</span><span class=n>explode</span><span class=p>(</span><span class=s2>&#34;to&#34;</span><span class=p>,</span> <span class=s2>&#34;distance&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>16</span><span class=cl> <span class=o>.</span><span class=n>with_columns</span><span class=p>(</span>
</span></span><span class=line><span class=ln>17</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;from&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>18</span><span class=cl> <span class=o>.</span><span class=n>map_dict</span><span class=p>(</span><span class=n>title_name_map</span><span class=p>),</span>
</span></span><span class=line><span class=ln>19</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;to&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>20</span><span class=cl> <span class=o>.</span><span class=n>map_dict</span><span class=p>(</span><span class=n>faiss_indices_map</span><span class=p>)</span>
</span></span><span class=line><span class=ln>21</span><span class=cl> <span class=o>.</span><span class=n>map_dict</span><span class=p>(</span><span class=n>title_name_map</span><span class=p>))</span>
</span></span><span class=line><span class=ln>22</span><span class=cl> <span class=o>.</span><span class=n>filter</span><span class=p>(</span><span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;from&#34;</span><span class=p>)</span> <span class=o>!=</span> <span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;to&#34;</span><span class=p>))</span>
</span></span><span class=line><span class=ln>23</span><span class=cl> <span class=p>)</span> </span></span></code></pre></div><h3 id=networkx-and-connected-components>NetworkX and Connected Components</h3><p>The next step in the process is to create a network graph using the edge-list. But why?</p><p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are both sufficiently similar to B, within a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully a small visual below would help.</p><p><img src=/blog/002_representative_samples/001_Network_Cluster_1.png alt="How a network component is formed."></p><p>What happens when such a concept is extended for many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p><p><img src=/blog/002_representative_samples/002_Network_Cluster_2.png alt="How a network cluster is formed."></p><p>Very Starry Night-esque vibes here. Let&rsquo;s get to the code.</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln>1</span><span class=cl><span class=kn>import</span> <span class=nn>networkx</span> <span class=k>as</span> <span class=nn>nx</span>
</span></span><span class=line><span class=ln>2</span><span class=cl><span class=k>def</span> <span class=nf>get_cluster_map</span><span class=p>(</span><span class=n>label</span><span class=p>,</span> <span class=n>k</span><span class=o>=</span><span class=mi>5</span><span class=p>,</span> <span class=n>min_cosine_distance</span><span class=o>=</span><span class=mf>0.95</span><span class=p>):</span>
</span></span><span class=line><span class=ln>3</span><span class=cl> <span class=n>edge_list</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=ln>4</span><span class=cl> <span class=n>get_edge_list</span><span class=p>(</span><span class=n>label</span><span class=p>,</span> <span class=n>k</span><span class=o>=</span><span class=n>k</span><span class=p>)</span>
</span></span><span class=line><span class=ln>5</span><span class=cl> <span class=o>.</span><span class=n>filter</span><span class=p>(</span><span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;distance&#34;</span><span class=p>)</span> <span class=o>&gt;=</span> <span class=n>min_cosine_distance</span><span class=p>)</span>
</span></span><span class=line><span class=ln>6</span><span class=cl> <span class=p>)</span>
</span></span><span class=line><span class=ln>7</span><span class=cl> <span class=n>graph</span> <span class=o>=</span> <span class=n>nx</span><span class=o>.</span><span class=n>from_pandas_edgelist</span><span class=p>(</span><span class=n>edge_list</span><span class=o>.</span><span class=n>to_pandas</span><span class=p>(),</span> <span class=n>source</span><span class=o>=</span><span class=s2>&#34;from&#34;</span><span class=p>,</span> <span class=n>target</span><span class=o>=</span><span class=s2>&#34;to&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>8</span><span class=cl> <span class=k>return</span> <span class=p>{</span><span class=n>i</span><span class=p>:</span> <span class=nb>list</span><span class=p>(</span><span class=n>x</span><span class=p>)</span> <span class=k>for</span> <span class=n>i</span><span class=p>,</span><span class=n>x</span> <span class=ow>in</span> <span class=nb>enumerate</span><span class=p>(</span><span class=n>nx</span><span class=o>.</span><span class=n>connected_components</span><span class=p>(</span><span class=n>graph</span><span class=p>))}</span></span></span></code></pre></div><h1 id=getting-clusters>Getting clusters</h1><p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phone Accessories</code>.</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln>1</span><span class=cl><span class=n>clusters</span> <span class=o>=</span> <span class=n>get_cluster_map</span><span class=p>(</span><span class=s2>&#34;Cell Phones Accessories&#34;</span><span class=p>,</span> <span class=mi>5</span><span class=p>,</span> <span class=mf>0.95</span><span class=p>)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p><ol><li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li><li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li></ol><h2 id=viewing-the-components>Viewing the components</h2><p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). 
Let&rsquo;s look at a random cluster:</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln>1</span><span class=cl><span class=o>&gt;&gt;&gt;</span> <span class=n>clusters</span><span class=p>[</span><span class=mi>3</span><span class=p>]</span>
</span></span><span class=line><span class=ln>2</span><span class=cl><span class=p>[</span><span class=s1>&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>3</span><span class=cl> <span class=s1>&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>4</span><span class=cl> <span class=s1>&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>5</span><span class=cl> <span class=s1>&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>6</span><span class=cl> <span class=s1>&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>7</span><span class=cl> <span class=s1>&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class=p>]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln> 1</span><span class=cl><span class=o>&gt;&gt;&gt;</span> <span class=n>clusters</span><span class=p>[</span><span class=mi>6</span><span class=p>]</span>
</span></span><span class=line><span class=ln> 2</span><span class=cl><span class=p>[</span><span class=s1>&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln> 3</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln> 4</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln> 5</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln> 6</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln> 7</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln> 8</span><span class=cl> <span class=o>...</span>
</span></span><span class=line><span class=ln> 9</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>10</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>11</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>12</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>13</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>14</span><span class=cl> <span class=s1>&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class=p>]</span></span></span></code></pre></div><h2 id=running-for-all-categories>Running for all categories</h2><p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate it for each category!</p><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln>1</span><span class=cl><span class=n>clusters</span> <span class=o>=</span> <span class=p>[</span><span class=n>get_cluster_map</span><span class=p>(</span><span class=n>x</span><span class=p>,</span> <span class=mi>5</span><span class=p>,</span> <span class=mf>0.95</span><span class=p>)</span> <span class=k>for</span> <span class=n>x</span> <span class=ow>in</span> <span class=n>data</span><span class=o>.</span><span class=n>get_column</span><span class=p>(</span><span class=s2>&#34;category_label&#34;</span><span class=p>)</span><span class=o>.</span><span class=n>unique</span><span class=p>()]</span></span></span></code></pre></div><h1 id=for-the-folks-in-a-hurry>For the folks in a hurry!</h1><p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p><h2 id=the-code>The code</h2><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=ln> 1</span><span class=cl><span class=kn>import</span> <span class=nn>sentence_transformers</span>
</span></span><span class=line><span class=ln> 2</span><span class=cl><span class=kn>import</span> <span class=nn>faiss</span>
</span></span><span class=line><span class=ln> 3</span><span class=cl><span class=kn>import</span> <span class=nn>polars</span> <span class=k>as</span> <span class=nn>pl</span>
</span></span><span class=line><span class=ln> 4</span><span class=cl><span class=kn>import</span> <span class=nn>numpy</span> <span class=k>as</span> <span class=nn>np</span>
</span></span><span class=line><span class=ln> 5</span><span class=cl>
</span></span><span class=line><span class=ln> 6</span><span class=cl><span class=c1># Data is read here. You can download the files from Kaggle here: </span>
</span></span><span class=line><span class=ln> 7</span><span class=cl><span class=c1># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class=line><span class=ln> 8</span><span class=cl><span class=n>data</span> <span class=o>=</span> <span class=n>pl</span><span class=o>.</span><span class=n>read_csv</span><span class=p>(</span><span class=s2>&#34;archive/shopmania.csv&#34;</span><span class=p>,</span> <span class=n>new_columns</span><span class=o>=</span><span class=p>[</span>
</span></span><span class=line><span class=ln> 9</span><span class=cl> <span class=s2>&#34;product_ID&#34;</span><span class=p>,</span> <span class=s2>&#34;product_title&#34;</span><span class=p>,</span> <span class=s2>&#34;category_ID&#34;</span><span class=p>,</span> <span class=s2>&#34;category_label&#34;</span><span class=p>])</span>
</span></span><span class=line><span class=ln>10</span><span class=cl><span class=n>data</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=ln>11</span><span class=cl> <span class=n>data</span>
</span></span><span class=line><span class=ln>12</span><span class=cl> <span class=o>.</span><span class=n>filter</span><span class=p>(</span><span class=n>pl</span><span class=o>.</span><span class=n>count</span><span class=p>()</span><span class=o>.</span><span class=n>over</span><span class=p>(</span><span class=s2>&#34;category_ID&#34;</span><span class=p>)</span> <span class=o>==</span> <span class=mi>10000</span><span class=p>)</span>
</span></span><span class=line><span class=ln>13</span><span class=cl> <span class=o>.</span><span class=n>with_row_count</span><span class=p>(</span><span class=s2>&#34;row_idx&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>14</span><span class=cl><span class=p>)</span>
</span></span><span class=line><span class=ln>15</span><span class=cl>
</span></span><span class=line><span class=ln>16</span><span class=cl>
</span></span><span class=line><span class=ln>17</span><span class=cl><span class=c1># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class=line><span class=ln>18</span><span class=cl><span class=n>ST</span> <span class=o>=</span> <span class=n>sentence_transformers</span><span class=o>.</span><span class=n>SentenceTransformer</span><span class=p>(</span><span class=s2>&#34;all-mpnet-base-v2&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>19</span><span class=cl><span class=n>title_embeddings</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=ln>20</span><span class=cl> <span class=n>ST</span><span class=o>.</span><span class=n>encode</span><span class=p>(</span>
</span></span><span class=line><span class=ln>21</span><span class=cl> <span class=n>data</span><span class=o>.</span><span class=n>get_column</span><span class=p>(</span><span class=s2>&#34;product_title&#34;</span><span class=p>)</span><span class=o>.</span><span class=n>to_list</span><span class=p>(),</span>
</span></span><span class=line><span class=ln>22</span><span class=cl> <span class=c1># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class=line><span class=ln>23</span><span class=cl> <span class=c1># if you&#39;ve got different hardware.</span>
</span></span><span class=line><span class=ln>24</span><span class=cl> <span class=n>device</span><span class=o>=</span><span class=s2>&#34;mps&#34;</span><span class=p>,</span>
</span></span><span class=line><span class=ln>25</span><span class=cl> <span class=n>show_progress_bar</span><span class=o>=</span><span class=kc>True</span><span class=p>,</span> <span class=n>convert_to_tensor</span><span class=o>=</span><span class=kc>True</span><span class=p>)</span>
</span></span><span class=line><span class=ln>26</span><span class=cl> <span class=o>.</span><span class=n>cpu</span><span class=p>()</span><span class=o>.</span><span class=n>numpy</span><span class=p>())</span>
</span></span><span class=line><span class=ln>27</span><span class=cl>
</span></span><span class=line><span class=ln>28</span><span class=cl><span class=c1># Code to create a FAISS index</span>
</span></span><span class=line><span class=ln>29</span><span class=cl><span class=k>def</span> <span class=nf>create_index</span><span class=p>(</span><span class=n>label</span><span class=p>):</span>
</span></span><span class=line><span class=ln>30</span><span class=cl> <span class=n>faiss_indices</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=ln>31</span><span class=cl> <span class=n>data</span> <span class=c1># this needs to be an argument if you want to create a generic function</span>
</span></span><span class=line><span class=ln>32</span><span class=cl> <span class=o>.</span><span class=n>filter</span><span class=p>(</span><span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;category_label&#34;</span><span class=p>)</span> <span class=o>==</span> <span class=n>label</span><span class=p>)</span>
</span></span><span class=line><span class=ln>33</span><span class=cl> <span class=o>.</span><span class=n>get_column</span><span class=p>(</span><span class=s2>&#34;row_idx&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>34</span><span class=cl> <span class=o>.</span><span class=n>to_list</span><span class=p>()</span>
</span></span><span class=line><span class=ln>35</span><span class=cl> <span class=p>)</span>
</span></span><span class=line><span class=ln>36</span><span class=cl>
</span></span><span class=line><span class=ln>37</span><span class=cl> <span class=n>faiss_data</span> <span class=o>=</span> <span class=n>title_embeddings</span><span class=p>[</span><span class=n>faiss_indices</span><span class=p>]</span>
</span></span><span class=line><span class=ln>38</span><span class=cl> <span class=n>d</span> <span class=o>=</span> <span class=n>faiss_data</span><span class=o>.</span><span class=n>shape</span><span class=p>[</span><span class=mi>1</span><span class=p>]</span> <span class=c1># Number of dimensions</span>
</span></span><span class=line><span class=ln>39</span><span class=cl> <span class=n>faiss_DB</span> <span class=o>=</span> <span class=n>faiss</span><span class=o>.</span><span class=n>IndexFlatIP</span><span class=p>(</span><span class=n>d</span><span class=p>)</span> <span class=c1># Index using Inner Product</span>
</span></span><span class=line><span class=ln>40</span><span class=cl> <span class=n>faiss</span><span class=o>.</span><span class=n>normalize_L2</span><span class=p>(</span><span class=n>faiss_data</span><span class=p>)</span> <span class=c1># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class=line><span class=ln>41</span><span class=cl> <span class=n>faiss_DB</span><span class=o>.</span><span class=n>add</span><span class=p>(</span><span class=n>faiss_data</span><span class=p>)</span> <span class=c1># Build the index</span>
</span></span><span class=line><span class=ln>42</span><span class=cl>
</span></span><span class=line><span class=ln>43</span><span class=cl> <span class=k>return</span> <span class=n>faiss_DB</span><span class=p>,</span> <span class=n>faiss_data</span><span class=p>,</span> <span class=n>faiss_indices</span>
</span></span><span class=line><span class=ln>44</span><span class=cl>
</span></span><span class=line><span class=ln>45</span><span class=cl><span class=c1># Code to create an edge-list</span>
</span></span><span class=line><span class=ln>46</span><span class=cl><span class=k>def</span> <span class=nf>get_edge_list</span><span class=p>(</span><span class=n>label</span><span class=p>,</span> <span class=n>k</span><span class=o>=</span><span class=mi>5</span><span class=p>):</span>
</span></span><span class=line><span class=ln>47</span><span class=cl> <span class=n>faiss_DB</span><span class=p>,</span> <span class=n>faiss_data</span><span class=p>,</span> <span class=n>faiss_indices</span> <span class=o>=</span> <span class=n>create_index</span><span class=p>(</span><span class=n>label</span><span class=p>)</span>
</span></span><span class=line><span class=ln>48</span><span class=cl> <span class=c1># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class=line><span class=ln>49</span><span class=cl> <span class=n>faiss_indices_map</span> <span class=o>=</span> <span class=p>{</span><span class=n>i</span><span class=p>:</span> <span class=n>x</span> <span class=k>for</span> <span class=n>i</span><span class=p>,</span><span class=n>x</span> <span class=ow>in</span> <span class=nb>enumerate</span><span class=p>(</span><span class=n>faiss_indices</span><span class=p>)}</span>
</span></span><span class=line><span class=ln>50</span><span class=cl> <span class=c1># To map the indices back to the original strings</span>
</span></span><span class=line><span class=ln>51</span><span class=cl> <span class=n>title_name_map</span> <span class=o>=</span> <span class=p>{</span><span class=n>i</span><span class=p>:</span> <span class=n>x</span> <span class=k>for</span> <span class=n>i</span><span class=p>,</span><span class=n>x</span> <span class=ow>in</span> <span class=n>data</span><span class=o>.</span><span class=n>select</span><span class=p>(</span><span class=s2>&#34;row_idx&#34;</span><span class=p>,</span> <span class=s2>&#34;product_title&#34;</span><span class=p>)</span><span class=o>.</span><span class=n>rows</span><span class=p>()}</span>
</span></span><span class=line><span class=ln>52</span><span class=cl> <span class=n>distances</span><span class=p>,</span> <span class=n>neighbors</span> <span class=o>=</span> <span class=n>faiss_DB</span><span class=o>.</span><span class=n>search</span><span class=p>(</span><span class=n>faiss_data</span><span class=p>,</span> <span class=n>k</span><span class=p>)</span>
</span></span><span class=line><span class=ln>53</span><span class=cl>
</span></span><span class=line><span class=ln>54</span><span class=cl> <span class=k>return</span> <span class=p>(</span>
</span></span><span class=line><span class=ln>55</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>DataFrame</span><span class=p>({</span>
</span></span><span class=line><span class=ln>56</span><span class=cl> <span class=s2>&#34;from&#34;</span><span class=p>:</span> <span class=n>faiss_indices</span><span class=p>})</span>
</span></span><span class=line><span class=ln>57</span><span class=cl> <span class=o>.</span><span class=n>with_columns</span><span class=p>(</span>
</span></span><span class=line><span class=ln>58</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>Series</span><span class=p>(</span><span class=s2>&#34;to&#34;</span><span class=p>,</span> <span class=n>neighbors</span><span class=p>),</span>
</span></span><span class=line><span class=ln>59</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>Series</span><span class=p>(</span><span class=s2>&#34;distance&#34;</span><span class=p>,</span> <span class=n>distances</span><span class=p>))</span>
</span></span><span class=line><span class=ln>60</span><span class=cl> <span class=o>.</span><span class=n>explode</span><span class=p>(</span><span class=s2>&#34;to&#34;</span><span class=p>,</span> <span class=s2>&#34;distance&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>61</span><span class=cl> <span class=o>.</span><span class=n>with_columns</span><span class=p>(</span>
</span></span><span class=line><span class=ln>62</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;from&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>63</span><span class=cl> <span class=o>.</span><span class=n>map_dict</span><span class=p>(</span><span class=n>title_name_map</span><span class=p>),</span>
</span></span><span class=line><span class=ln>64</span><span class=cl> <span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;to&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>65</span><span class=cl> <span class=o>.</span><span class=n>map_dict</span><span class=p>(</span><span class=n>faiss_indices_map</span><span class=p>)</span>
</span></span><span class=line><span class=ln>66</span><span class=cl> <span class=o>.</span><span class=n>map_dict</span><span class=p>(</span><span class=n>title_name_map</span><span class=p>))</span>
</span></span><span class=line><span class=ln>67</span><span class=cl> <span class=o>.</span><span class=n>filter</span><span class=p>(</span><span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;from&#34;</span><span class=p>)</span> <span class=o>!=</span> <span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;to&#34;</span><span class=p>))</span>
</span></span><span class=line><span class=ln>68</span><span class=cl> <span class=p>)</span>
</span></span><span class=line><span class=ln>69</span><span class=cl>
</span></span><span class=line><span class=ln>70</span><span class=cl><span class=c1># Code to extract components from a Network Graph</span>
</span></span><span class=line><span class=ln>71</span><span class=cl><span class=kn>import</span> <span class=nn>networkx</span> <span class=k>as</span> <span class=nn>nx</span>
</span></span><span class=line><span class=ln>72</span><span class=cl><span class=k>def</span> <span class=nf>get_cluster_map</span><span class=p>(</span><span class=n>label</span><span class=p>,</span> <span class=n>k</span><span class=o>=</span><span class=mi>5</span><span class=p>,</span> <span class=n>min_cosine_distance</span><span class=o>=</span><span class=mf>0.95</span><span class=p>):</span>
</span></span><span class=line><span class=ln>73</span><span class=cl> <span class=n>edge_list</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=ln>74</span><span class=cl> <span class=n>get_edge_list</span><span class=p>(</span><span class=n>label</span><span class=p>,</span> <span class=n>k</span><span class=o>=</span><span class=n>k</span><span class=p>)</span>
</span></span><span class=line><span class=ln>75</span><span class=cl> <span class=o>.</span><span class=n>filter</span><span class=p>(</span><span class=n>pl</span><span class=o>.</span><span class=n>col</span><span class=p>(</span><span class=s2>&#34;distance&#34;</span><span class=p>)</span> <span class=o>&gt;=</span> <span class=n>min_cosine_distance</span><span class=p>)</span>
</span></span><span class=line><span class=ln>76</span><span class=cl> <span class=p>)</span>
</span></span><span class=line><span class=ln>77</span><span class=cl> <span class=n>graph</span> <span class=o>=</span> <span class=n>nx</span><span class=o>.</span><span class=n>from_pandas_edgelist</span><span class=p>(</span><span class=n>edge_list</span><span class=o>.</span><span class=n>to_pandas</span><span class=p>(),</span> <span class=n>source</span><span class=o>=</span><span class=s2>&#34;from&#34;</span><span class=p>,</span> <span class=n>target</span><span class=o>=</span><span class=s2>&#34;to&#34;</span><span class=p>)</span>
</span></span><span class=line><span class=ln>78</span><span class=cl> <span class=k>return</span> <span class=p>{</span><span class=n>i</span><span class=p>:</span> <span class=nb>list</span><span class=p>(</span><span class=n>x</span><span class=p>)</span> <span class=k>for</span> <span class=n>i</span><span class=p>,</span><span class=n>x</span> <span class=ow>in</span> <span class=nb>enumerate</span><span class=p>(</span><span class=n>nx</span><span class=o>.</span><span class=n>connected_components</span><span class=p>(</span><span class=n>graph</span><span class=p>))}</span>
</span></span><span class=line><span class=ln>79</span><span class=cl>
</span></span><span class=line><span class=ln>80</span><span class=cl><span class=c1># Example call to a single category to obtain its clusters</span>
</span></span><span class=line><span class=ln>81</span><span class=cl><span class=n>clusters</span> <span class=o>=</span> <span class=n>get_cluster_map</span><span class=p>(</span><span class=s2>&#34;Cell Phones Accessories&#34;</span><span class=p>,</span> <span class=mi>5</span><span class=p>,</span> <span class=mf>0.95</span><span class=p>)</span>
</span></span><span class=line><span class=ln>82</span><span class=cl><span class=c1># Example call to **all** categories to obtain all clusters</span>
</span></span><span class=line><span class=ln>83</span><span class=cl><span class=n>clusters</span> <span class=o>=</span> <span class=p>[</span><span class=n>get_cluster_map</span><span class=p>(</span><span class=n>x</span><span class=p>,</span> <span class=mi>5</span><span class=p>,</span> <span class=mf>0.95</span><span class=p>)</span> <span class=k>for</span> <span class=n>x</span> <span class=ow>in</span> <span class=n>data</span><span class=o>.</span><span class=n>get_column</span><span class=p>(</span><span class=s2>&#34;category_label&#34;</span><span class=p>)</span><span class=o>.</span><span class=n>unique</span><span class=p>()]</span></span></span></code></pre></div><h2 id=how-the-code-works>How the code works</h2><p>If you want to write down an algorithmic way of looking at this approach,</p><ol><li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li><li>Create an ANN database (based on a package such as <code>faiss</code>) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li><li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li><li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li><li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li></ol></content><p><a class=blog-tags href=/tags/representative/>#representative</a>
<a class=blog-tags href=/tags/samples/>#samples</a>
<a class=blog-tags href=/tags/faiss/>#faiss</a>
<a class=blog-tags href=/tags/approximate/>#approximate</a>
<a class=blog-tags href=/tags/nearest/>#nearest</a>
<a class=blog-tags href=/tags/neighbor/>#neighbor</a>
<a class=blog-tags href=/tags/network/>#network</a>
<a class=blog-tags href=/tags/graph/>#graph</a>
<a class=blog-tags href=/tags/networkx/>#networkx</a>
<a class=blog-tags href=/tags/polars/>#polars</a>
<a class=blog-tags href=/tags/category/>#category</a></p></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 247 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 210 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 311 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 294 KiB

File diff suppressed because one or more lines are too long

21
public/blog/index.html Normal file
View File

@@ -0,0 +1,21 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>blog | Avinash's Blog</title><meta name=title content="blog"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/blog/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="blog"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="blog"><meta itemprop=name content="blog"><meta itemprop=datePublished content="2023-10-20T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-20T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/blog/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><ul class=blog-posts><li><span><i><time datetime=2023-10-20 pubdate>2023-10-20
</time></i></span><a href=/blog/003_powerpointsnap/>Quick hacks to make client-ready presentations</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li><li><span><i><time datetime=2023-06-22 pubdate>2023-06-22
</time></i></span><a href=/blog/001_overlap_joins/>Overlap Joins: Number of docker trucks in an interval</a></li></ul><div><a class=blog-tags href=/tags/approximate/>#approximate</a>
<a class=blog-tags href=/tags/category/>#category</a>
<a class=blog-tags href=/tags/faiss/>#faiss</a>
<a class=blog-tags href=/tags/graph/>#graph</a>
<a class=blog-tags href=/tags/nearest/>#nearest</a>
<a class=blog-tags href=/tags/neighbor/>#neighbor</a>
<a class=blog-tags href=/tags/network/>#network</a>
<a class=blog-tags href=/tags/networkx/>#networkx</a>
<a class=blog-tags href=/tags/polars/>#polars</a>
<a class=blog-tags href=/tags/powerpoint/>#powerpoint</a>
<a class=blog-tags href=/tags/ppt/>#ppt</a>
<a class=blog-tags href=/tags/representative/>#representative</a>
<a class=blog-tags href=/tags/samples/>#samples</a>
<a class=blog-tags href=/tags/vba/>#vba</a></div></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

927
public/blog/index.xml Normal file
View File

@@ -0,0 +1,927 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>blog on Avinash's Blog</title><link>https://avimallu.dev/blog/</link><description>Recent content in blog on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Fri, 20 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/blog/index.xml" rel="self" type="application/rss+xml"/><item><title>Quick hacks to make client-ready presentations</title><link>https://avimallu.dev/blog/003_powerpointsnap/</link><pubDate>Fri, 20 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/003_powerpointsnap/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (&lt;em>decks&lt;/em> in consulting lingo - not even &lt;em>slide decks&lt;/em>). However, it was rather repetitive. Thus, was born PowerPointSnap.&lt;/p>
&lt;h1 id="what-is-it">What is it?&lt;/h1>
&lt;p>I&amp;rsquo;ll write this down as pointers.&lt;/p>
&lt;ol>
&lt;li>It&amp;rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.&lt;/li>
&lt;li>It&amp;rsquo;s Windows only - it&amp;rsquo;s unlikely to work on MacOS.&lt;/li>
&lt;li>It&amp;rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.&lt;/li>
&lt;/ol>
&lt;h1 id="how-do-i-get-it">How do I get it?&lt;/h1>
&lt;p>The project is available on this &lt;a href="https://github.com/avimallu/PowerPointSnap">Github repo&lt;/a>. The instructions to install it are available there, but here&amp;rsquo;s the down-low:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (<em>decks</em> in consulting lingo - not even <em>slide decks</em>). However, it was rather repetitive. Thus, was born PowerPointSnap.</p>
<h1 id="what-is-it">What is it?</h1>
<p>I&rsquo;ll write this down as pointers.</p>
<ol>
<li>It&rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.</li>
<li>It&rsquo;s Windows only - it&rsquo;s unlikely to work on MacOS.</li>
<li>It&rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.</li>
</ol>
<h1 id="how-do-i-get-it">How do I get it?</h1>
<p>The project is available on this <a href="https://github.com/avimallu/PowerPointSnap">Github repo</a>. The instructions to install it are available there, but here&rsquo;s the down-low:</p>
<ol>
<li>Download the Snap.ppam file to your system.</li>
<li>Enable the developer options.</li>
<li>Go to the Developer tab, and click on PowerPoint Add-ins.</li>
<li>Click on Add New. Choose the location of the file you just downloaded. Click Close.</li>
<li>To uninstall, repeat the process, and simply click on Remove this time.</li>
</ol>
<h1 id="what-can-i-do-with-it">What can I do with it?</h1>
<p>Frankly, a LOT. The base concept of this tool is:</p>
<ol>
<li>&ldquo;Set&rdquo; a shape as the one you want to copy a property from.</li>
<li>Select any property from the list to automatically apply it.</li>
</ol>
<p>Here&rsquo;s a non-exhaustive list of all the options available.</p>
<h2 id="apply-properties-of-shapes-directly">Apply properties of shapes directly</h2>
<p>This is the part of the interface that can be used for shapes (which include charts and tables).</p>
<p><img src="/blog/003_powerpointsnap/01_Shapes.png" alt="The UI for copying shape properties"></p>
<p>To use, first select a <em>shape</em> object, click on &ldquo;Set&rdquo;. Then, choose the object you want to <em>Snap</em> its properties to (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.</p>
<p>Note that it&rsquo;s probably not a good idea to apply a property of a shape to a table - if you want to make the entire table orange, there are probably better built-in ways to do it than to use <em>Snap</em>.</p>
<h2 id="beautify-charts-with-snappable-properties">Beautify charts with <em>Snap</em>pable properties</h2>
<p>Charts are also supported, with dedicated features for it.</p>
<p><img src="/blog/003_powerpointsnap/02_Charts.png" alt="The UI for copying chart properties"></p>
<p>What do these features do? You should be able to hover over the option and get a tooltip that shows what it&rsquo;s capable of, but here&rsquo;s another summary just in case:</p>
<ol>
<li>Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the &ldquo;set&rdquo; chart to the one you&rsquo;ve selected. I couldn&rsquo;t put in just $x$ and $y$ here because Microsoft internally doesn&rsquo;t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn&rsquo;t work well yet for 3D charts.</li>
<li>Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all <em>look</em> exactly the same from a client perspective. But that&rsquo;s usually difficult if you&rsquo;ve already configured the charts a little - which can be remedied with this option!</li>
<li>Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you&rsquo;ve selected with the way it originally is in the &ldquo;set&rdquo; chart. The reason for this feature is simply to avoid going back to <em>Home</em> to click on the <em>Format Painter</em> option again.</li>
<li>Reset Axes Scales: in case you messed up somewhere, you can use this to revert to PowerPoint defaults.</li>
</ol>
<p>The next two options deserve their own section.</p>
<h2 id="customize-the-labels-programmatically">Customize the labels programmatically</h2>
<p>Your immediate senior in a consulting environment would frown at your chart, and then exclaim, &ldquo;I think that&rsquo;s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it&rsquo;s a one time thing!&rdquo;</p>
<p>It&rsquo;s <strong>never</strong> a one time affair. But don&rsquo;t worry, we have this nice feature to help us. If you click on the <em>Customize Label</em> option, you will get this (without the &ldquo;Set&rdquo; option):</p>
<p><img src="/blog/003_powerpointsnap/DataLabelsScreenshot.JPG" alt="The UI for customizing labels."></p>
<p>Never mind the rather unfriendly legend entries. They&rsquo;re just here to demonstrate that you can do the following kinds of whacky abilities with your own chart!</p>
<h3 id="screenshots-of-the-chart-snapability">Screenshots of the chart <em>snap</em>ability</h3>
<p>Of course, visuals will do it more justice. For example, look at this image:</p>
<p><img src="/blog/003_powerpointsnap/Revenue_Presentation_1.png" alt="There's a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren't centered."></p>
<p>Here&rsquo;s what you can do:</p>
<ol>
<li>Click on the left chart. Press &ldquo;Set&rdquo; in the toolbar for <em>Snap</em>.</li>
<li>Click on the right chart, and then go through the following:
<ol>
<li>In <em>Shapes</em>, click on <em>Dim</em>. This will align the shapes of the chart.</li>
<li>Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.</li>
<li>You&rsquo;ll notice that the chart area still doesn&rsquo;t match, nor does the title.</li>
<li>In <em>Charts</em>, click on <em>Sync Plot Area</em> and <em>Sync Title Area</em>, and watch the magic unfold.</li>
<li>Now, click on the second chart, and click on &ldquo;Set&rdquo;. Let&rsquo;s align the axes of the first chart to the second one.</li>
<li>Click on the first chart, and then in <em>Charts</em>, click <em>Sync Value Axis</em>.</li>
</ol>
</li>
<li>Let&rsquo;s bring that senior&rsquo;s exclamation back into play - (s)he wants you to highlight <em>only</em> Profit labels, and that too every 2 iterations. To do this:
<ol>
<li>Click on <em>Customize Labels</em> after clicking on either chart.</li>
<li>You&rsquo;ll get the screen shown in the previous section. Make sure to adjust the values such that it&rsquo;s exactly like the screenshot there.</li>
<li>Click on &ldquo;Save and Run&rdquo;. This will <em>save</em> the configuration you&rsquo;ve selected, and <em>run</em> it on the chart you&rsquo;ve selected.</li>
<li>Click the other chart. Then, in <em>Charts</em>, click on <em>Rerun Customization</em>.</li>
</ol>
</li>
</ol>
<p>This is what your results should look like:</p>
<p><img src="/blog/003_powerpointsnap/Revenue_Presentation_2.png" alt="Everything is almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it quickly… maybe they should change some requirements…"></p>
<p>Of course, getting those calculations right is a whole different thing that will need some work.</p>
<h2 id="align-table-dimensions">Align table dimensions</h2>
<p>Oftentimes, you have two tables that show similar values&hellip; you know the drill. Here&rsquo;s what you can do in a scenario such as this:</p>
<p><img src="/blog/003_powerpointsnap/Table_Presentation_1.png" alt="Similar data, but vastly different tables."></p>
<p>This is what the <em>Tables</em> section of the tool looks like:</p>
<p><img src="/blog/003_powerpointsnap/03_Tables.png" alt="The UI for Tables"></p>
<p>To align these tables together,</p>
<ol>
<li>Click on the left table. Press &ldquo;Set&rdquo; in the toolbar for <em>Snap</em>.</li>
<li>Click on the right table.</li>
<li>Click on <em>Shapes</em>, inside it, <em>Dim</em>. Now the shapes of the table are the same.</li>
<li>In <em>Tables</em>, click on <em>Sync Column Widths</em>. Now the columns are also the same.</li>
<li>If you try to align by rows, it fails because the number of rows is not the same in the two tables.</li>
</ol>
<p>Here&rsquo;s what you&rsquo;ll end up with:</p>
<p><img src="/blog/003_powerpointsnap/Table_Presentation_2.png" alt="Similar data, and similar enough tables."></p>
<p>Pretty neat, eh?</p>
]]></content:encoded></item><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> indicating the command used, while the output appears without that prefix.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? A small sample makes the model faster to train - <em>and</em> keeps the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium-scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that there are shortcuts that ANN algorithms take to give you, if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Eucledian Matrix</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that sends only the necessary data as that for a particular category, and then create the database. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Eucledian or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em> i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are both similar enough to B to meet a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully the small visual below helps.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended for many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very Starry Night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Lower the <code>min_cosine_distance</code> value (despite its name, it is a minimum cosine <em>similarity</em>) if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate over every category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You can download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map positions in the FAISS index back to the original `row_idx` values in `data`</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows you to run fast nearest-neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item><item><title>Overlap Joins: Number of docker trucks in an interval</title><link>https://avimallu.dev/blog/001_overlap_joins/</link><pubDate>Thu, 22 Jun 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/001_overlap_joins/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>I stumbled upon an interesting &lt;a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stack Overflow question&lt;/a> that was linked &lt;a href="https://github.com/pola-rs/polars/issues/9467">via an issue&lt;/a> on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question, Polars did not have support for non-equi joins, and any solution using only Polars would be pretty cumbersome.&lt;/p>
&lt;p>I&amp;rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.&lt;/p>
&lt;h1 id="problem-statement">Problem Statement&lt;/h1>
&lt;p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&amp;rsquo;s ID.&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>I stumbled upon an interesting <a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stack Overflow question</a> that was linked <a href="https://github.com/pola-rs/polars/issues/9467">via an issue</a> on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question, Polars did not have support for non-equi joins, and any solution using only Polars would be pretty cumbersome.</p>
<p>I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.</p>
<h1 id="problem-statement">Problem Statement</h1>
<p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span> <span class="c1"># if you don&#39;t have polars, run </span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="c1"># pip install &#39;polars[all]&#39;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">from_repr</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1 │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1 │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5 │
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3 │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3 │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6 │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5 │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6 │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div><p>We want to identify the number of trucks docked at any given time within a threshold of 1 minute <em>prior</em> to the arrival time of a truck, and 1 minute <em>after</em> the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.</p>
<h1 id="finding-a-solution-to-the-problem">Finding a solution to the problem</h1>
<h2 id="evaluate-for-a-specific-row">Evaluate for a specific row</h2>
<p>Before we find a general solution to this problem, let&rsquo;s consider a specific row to understand the problem better:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span></span></span></code></pre></div><p>For this row, we need to find the number of trucks that are there between <code>2023-01-01 06:31:06</code> (1 minute prior to the <code>arrival_time</code>) and <code>2023-01-01 06:34:48</code> (1 minute post the <code>departure_time</code>). Manually going through the original dataset, we see that <code>B3</code>, <code>C3</code>, <code>A6</code> and <code>A5</code> are the truck IDs that qualify - they all are at the station in a duration that is between <code>2023-01-01 06:31:06</code> and <code>2023-01-01 06:34:48</code>.</p>
<h2 id="visually-deriving-an-algorithm">Visually deriving an algorithm</h2>
<p>There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap <em>window</em> relative to the arrival and departure times):</p>
<p><img src="/blog/001_overlap_joins/overlap_algorithm.png" alt="The five different ways a period can overlap."></p>
<p>Take some time to absorb these cases - it&rsquo;s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.</p>
<h2 id="writing-an-sql-query-based-on-the-algorithm">Writing an SQL query based on the algorithm</h2>
<p>In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It&rsquo;s often able to convey elegantly the logic that was used to execute the algorithm; and while it does come with excessive verbosity at times, it doesn&rsquo;t quite in this case.</p>
<p>Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).</p>
<h3 id="introducing-the-duckdb-package">Introducing the DuckDB package</h3>
<p>Once again, in theory, any SQL package or language can be used. Far too few however meet the ease-of-use that <a href="https://duckdb.org/">DuckDB</a> provides:</p>
<ol>
<li>no expensive set-up time (meaning no need for setting up databases, even temporary ones),</li>
<li>no dependencies (other than DuckDB itself, just <code>pip install duckdb</code>),</li>
<li>some very <a href="https://duckdb.org/2022/05/04/friendlier-sql.html">friendly SQL extensions</a>, and</li>
<li>ability to work directly on Polars and Pandas DataFrames without conversions</li>
</ol>
<p>all with <a href="https://duckdblabs.github.io/db-benchmark/">mind-blowing speed</a> that stands shoulder-to-shoulder with Polars. We&rsquo;ll also use a few advanced SQL concepts noted below.</p>
<h4 id="self-joins">Self-joins</h4>
<p>This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.</p>
<h4 id="a-bullet-train-recap-of-non-equi-joins">A bullet train recap of non-equi joins</h4>
<p>A key concept that we&rsquo;ll use is the idea of joining on a <em>range</em> of values rather than a specific value. That is, instead of the usual <code>LEFT JOIN ON A.column = B.column</code>, we can do <code>LEFT JOIN ON A.column &lt;= B.column</code> for one row in table <code>A</code> to match to multiple rows in <code>B</code>. DuckDB has a <a href="https://duckdb.org/2022/05/27/iejoin.html">blog post</a> that outlines this join in detail, including a fast implementation.</p>
<h4 id="the-concept-of-list-columns">The concept of <code>LIST</code> columns</h4>
<p>DuckDB has first class support for <code>LIST</code> columns - that is, each row in a <code>LIST</code> column can have a varying length (much like a Python <code>list</code>), but must have the exact same datatype (like R&rsquo;s <code>vector</code>). Using list columns allows us to eschew the use of an additional <code>GROUP BY</code> operation on top of a <code>WHERE</code> filter or <code>SELECT DISTINCT</code> operation, since we can directly perform those on the <code>LIST</code> column itself.</p>
<h4 id="date-algebra">Date algebra</h4>
<p>Dates can be rather difficult to handle well in most tools and languages, with several packages purpose built to make handling them easier - <a href="https://lubridate.tidyverse.org/">lubridate</a> from the <a href="https://www.tidyverse.org/">tidyverse</a> is a stellar example. Thankfully, DuckDB provides a similar swiss-knife set of tools to deal with them, including specifying <code>INTERVAL</code>s (a special data type that represents a period of time independent of specific time values) to modify <code>TIMESTAMP</code> values using addition or subtraction.</p>
<h3 id="tell-me-the-query-please">Tell me the query, PLEASE!</h3>
<p>Okay - that was a lot of background. Let&rsquo;s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">DATEDIFF</span><span class="p">(</span><span class="s1">&#39;seconds&#39;</span><span class="p">,</span><span class="w"> </span><span class="n">arrival_time</span><span class="p">,</span><span class="w"> </span><span class="n">departure_time</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">duration</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">((</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">))</span><span class="w">
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>A small, succinct query such as this will need a bit of explanation to take it all in. Here&rsquo;s one below, reproducible in Python (make sure to install <code>duckdb</code> first!). Expand it to view.</p>
<details markdown="1"><summary>SQL with explanation.</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> ,A.window_open
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.window_close
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> -- LIST aggregates the values into a LIST column
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> -- and LIST_DISTINCT finds the unique values in it
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> -- finally, LIST_UNIQUE calculates the unique number of values in it
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2"> FROM (
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2"> ,arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2"> ,departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2"> FROM data -- remember we defined data as the Polars DataFrame with our truck station data
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2"> ) A
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2"> LEFT JOIN (
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2"> -- This is the time, in seconds between the arrival and departure of
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2"> -- each truck PER ROW in the original data-frame
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2"> ,DATEDIFF(&#39;seconds&#39;, arrival_time, departure_time) AS duration
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2"> FROM data -- this is where we perform a self-join
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2"> ) B
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2"> ON (
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2"> -- Case 2 in the diagram;
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2"> (B.arrival_time &lt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2"> -- Adding the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2"> -- is at least ENDING AFTER the start of the overlap window
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2"> (B.arrival_time + TO_SECONDS(B.duration)) &gt;= A.window_open) OR
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="s2"> -- Case 3 in the diagram - the simplest of all five cases
</span></span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="s2"> B.departure_time &lt;= A.window_close) OR
</span></span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="s2"> -- Case 4 in the diagram;
</span></span></span><span class="line"><span class="ln">43</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">44</span><span class="cl"><span class="s2"> -- Subtracting the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="s2"> -- STARTS BEFORE the end of the overlap window.
</span></span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="s2"> (B.departure_time - TO_SECONDS(B.duration)) &lt;= A.window_close)
</span></span></span><span class="line"><span class="ln">47</span><span class="cl"><span class="s2"> )
</span></span></span><span class="line"><span class="ln">48</span><span class="cl"><span class="s2"> GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">49</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div></details>
<p>The output of this query is:</p>
<pre tabindex="0"><code>&#34;&#34;&#34;
┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
│ arrival_time │ departure_time │ window_open │ … │ docked_trucks │ docked_truck_count │
│ timestamp │ timestamp │ timestamp │ │ varchar[] │ uint64 │
├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1] │ 1 │
│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1] │ 1 │
│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │ 4 │
│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3] │ 3 │
│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3] │ 3 │
├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
│ 9 rows 6 columns (5 shown) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
&#34;&#34;&#34;</code></pre><p>We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also see how DuckDB is able to seamlessly integrate with an existing Pandas or Polars pipeline with zero-conversion costs. In fact, we can convert the result back to a Polars or Pandas dataframe by appending a method call after the closing bracket: <code>db.query(...).pl()</code> and <code>db.query(...).pd()</code> respectively.</p>
<h2 id="can-we-make-the-sql-simpler">Can we make the SQL simpler?</h2>
<p>Now that we&rsquo;ve understood the logic that goes into the query, let&rsquo;s try to optimize the algorithm. We have the three conditions:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="c1">-- Case 2 in the diagram
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="w"></span><span class="c1">-- Case 3 in the diagram
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="w"></span><span class="c1">-- Case 4 in the diagram
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span></span></span></code></pre></div><p>What is common between these three conditions? It takes a while to see it; but it becomes clear that all these cases require the start of the overlap to be <em>before</em> the window ends, and the end of the overlap to be <em>after</em> the window starts. This can be simplified to just:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="w"></span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span></span></span></code></pre></div><p>making our query much simpler!</p>
<h3 id="simplified-sql-part-1">Simplified SQL: Part 1</h3>
<p>We&rsquo;ve removed the need for the <code>duration</code> calculation altogether now. Therefore, we can write:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="p">)</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Can we simplify this even further?</p>
<h3 id="simplification-part-2">Simplification: Part 2</h3>
<p>I think the SQL query in the above section is very easy to read already. However, it is a little clunky overall, and there is a way that we can leverage DuckDB&rsquo;s extensive optimizations to improve <strong>legibility</strong> by rewriting the query as a cross join:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">A</span><span class="p">,</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">WHERE</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"></span><span class="k">AND</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Why does this work? Before optimization on DuckDB, this is what the query plan looks like:</p>
<details markdown="1"><summary>DuckDB query plan before optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ FILTER │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ (arrival_time &lt;= │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│(departure_time + to_m... │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ (departure_time &gt;= │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│(arrival_time - to_min... │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">│ CROSS_PRODUCT ├──────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>After optimization, the <code>CROSS_PRODUCT</code> is <strong>automatically</strong> optimized to an <strong>interval join</strong>!</p>
<details markdown="1"><summary>DuckDB query plan after optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ COMPARISON_JOIN │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ INNER │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│ ((departure_time + &#39;00:01 │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ :00&#39;::INTERVAL) &gt;= ├──────────────┐
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ arrival_time) │ │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│((arrival_time - &#39;00:01:00&#39;│ │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ ::INTERVAL) &lt;= │ │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">│ departure_time) │ │
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>So in effect, we&rsquo;re actually exploiting a feature of DuckDB to allow us to write our queries in a suboptimal manner for greater readability, and allowing the optimizer to do a good chunk of our work for us. I wouldn&rsquo;t recommend using this generally, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.</p>
<h3 id="how-to-get-query-plans">How to get query plans?</h3>
<p>I&rsquo;m glad you asked. Here&rsquo;s the DuckDB <a href="https://duckdb.org/docs/guides/meta/explain.html">page explaining <code>EXPLAIN</code></a> (heh). Here&rsquo;s the code I used:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&#34;SET EXPLAIN_OUTPUT=&#39;all&#39;;&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="nb">print</span><span class="p">(</span><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">EXPLAIN
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">SELECT
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">FROM data A, data B
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">WHERE B.arrival_time &lt;= window_close
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">AND B.departure_time &gt;= window_open
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">pl</span><span class="p">()[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span></span></span></code></pre></div><h1 id="what-are-the-alternatives">What are the alternatives?</h1>
<h2 id="the-datatable-way">The <code>data.table</code> way</h2>
<p><a href="https://github.com/Rdatatable/data.table"><code>data.table</code></a> is a package that has historically been ahead of its time - in both speed and features that it has had. Development has taken a hit recently, but will likely <a href="https://github.com/Rdatatable/data.table/issues/5656">pick back up</a>. It&rsquo;s my favourite package on all fronts for data manipulation, but suffers simply from the lack of broader R support across the ML and DL space.</p>
<h3 id="the-foverlaps-function">The <code>foverlaps</code> function</h3>
<p>If this kind of overlapping join is common, shouldn&rsquo;t someone have developed a package for it? Turns out, <code>data.table</code> has, and with very specific constraints that make it the perfect solution to our problem (if you don&rsquo;t mind switching over to R, that is).</p>
<p>The <code>foverlaps</code> function has these requirements:</p>
<ol>
<li>The input <code>data.table</code> objects have to be keyed for automatic recognition of columns.</li>
<li>The default match type matches all three cases from the image above. Side note: it also has match types for <code>within</code> overlap, and for matching <code>start</code> and <code>end</code> windows.</li>
<li>The last two matching columns in the join condition in <code>by</code> must specify the <code>start</code> and <code>end</code> points of the overlapping window. This isn&rsquo;t a problem for us now, but does restrict future uses where we may want non-equi joins in other cases.</li>
</ol>
<h3 id="the-code-_si_-the-code">The code, <em>si</em>, the code!</h3>
<p>Without further ado:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">data.table</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">lubridate</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl">
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="n">arrival_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s">&#39;2023-01-01 06:23:47.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:26:42.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s">&#39;2023-01-01 06:30:20.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:32:06.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s">&#39;2023-01-01 06:33:09.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:34:08.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:40.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:37:43.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s">&#39;2023-01-01 06:39:48.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="n">departure_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s">&#39;2023-01-01 06:25:08.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:28:02.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s">&#39;2023-01-01 06:35:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:33:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:39:49.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s">&#39;2023-01-01 06:38:34.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:40:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s">&#39;2023-01-01 06:46:10.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="n">ID</span> <span class="o">=</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;B3&#39;</span><span class="p">,</span> <span class="s">&#39;C3&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl">
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">DT</span> <span class="o">=</span> <span class="nf">data.table</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">arrival_time</span> <span class="o">=</span> <span class="n">arrival_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">departure_time</span> <span class="o">=</span> <span class="n">departure_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="n">ID</span> <span class="o">=</span> <span class="n">ID</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl">
</span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="c1"># A copy(DT) creates a copy of a data.table that isn&#39;t linked</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="c1"># to the original one, so that changes in it don&#39;t reflect in</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="c1"># the original DT object.</span>
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># The `:=` allows assignment by reference (i.e. &#34;in place&#34;).</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="n">DT_with_windows</span> <span class="o">=</span> <span class="nf">copy</span><span class="p">(</span><span class="n">DT</span><span class="p">)</span><span class="n">[</span><span class="p">,</span> <span class="nf">`:=`</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">window_start</span> <span class="o">=</span> <span class="n">arrival_time</span> <span class="o">-</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">window_end</span> <span class="o">=</span> <span class="n">departure_time</span> <span class="o">+</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">))</span><span class="n">]</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl">
</span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="c1"># This step is necessary for the second table, but not the first, but we</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="c1"># key both data.tables to make the foverlaps code very succinct.</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;arrival_time&#34;</span><span class="p">,</span> <span class="s">&#34;departure_time&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT_with_windows</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;window_start&#34;</span><span class="p">,</span> <span class="s">&#34;window_end&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">37</span><span class="cl">
</span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="c1"># The foverlaps function returns a data.table, so we can simply apply</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="c1"># the usual data.table syntax on it!</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="c1"># Since we have the same name of some columns in both data.tables,</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="c1"># the latter table&#39;s columns are prefixed with &#34;i.&#34; to avoid conflicts.</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="nf">foverlaps</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="n">DT_with_windows</span><span class="p">)</span><span class="n">[</span>
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="p">,</span> <span class="n">.(docked_trucks</span> <span class="o">=</span> <span class="nf">list</span><span class="p">(</span><span class="nf">unique</span><span class="p">(</span><span class="n">i.ID</span><span class="p">)),</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl"> <span class="n">docked_truck_count</span> <span class="o">=</span> <span class="nf">uniqueN</span><span class="p">(</span><span class="n">i.ID</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">45</span><span class="cl"> <span class="p">,</span> <span class="n">.(arrival_time</span><span class="p">,</span> <span class="n">departure_time</span><span class="p">)</span><span class="n">]</span></span></span></code></pre></div><p>provides us the output:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"> <span class="n">arrival_time</span> <span class="n">departure_time</span> <span class="n">docked_trucks</span> <span class="n">docked_truck_count</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">list</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">int</span><span class="o">&gt;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="m">1</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">23</span><span class="o">:</span><span class="m">47</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">25</span><span class="o">:</span><span class="m">08</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="m">2</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">26</span><span class="o">:</span><span class="m">42</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">28</span><span class="o">:</span><span class="m">02</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="m">3</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">30</span><span class="o">:</span><span class="m">20</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">35</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="m">4</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">32</span><span class="o">:</span><span class="m">06</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">48</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="m">5</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">09</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="m">6</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">34</span><span class="o">:</span><span class="m">08</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">49</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="m">7</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">40</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">38</span><span class="o">:</span><span class="m">34</span> <span class="n">B3</span><span class="p">,</span><span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="m">8</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">37</span><span class="o">:</span><span class="m">43</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">40</span><span class="o">:</span><span class="m">48</span> <span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">3</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="m">9</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">48</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">46</span><span class="o">:</span><span class="m">10</span> <span class="n">C3</span><span class="p">,</span><span class="n">A5</span><span class="p">,</span><span class="n">A6</span> <span class="m">3</span></span></span></code></pre></div><h3 id="considerations-for-using-datatable">Considerations for using <code>data.table</code></h3>
<p>The package offers a wonderful, nearly one-stop solution that doesn&rsquo;t require you to write the logic out for the query or command yourself, but has a major problem for a lot of users - it requires you to switch your codebase to R, and a lot of your tasks may be on Python or in an SQL pipeline. So, what do you do?</p>
<p>Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you&rsquo;ll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.</p>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,5 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Categories | Avinash's Blog</title><meta name=title content="Categories"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/categories/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Categories"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Categories"><meta itemprop=name content="Categories"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/categories/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Categories"</h3><ul class=blog-posts><li>No posts yet</li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Categories on Avinash's Blog</title><link>https://avimallu.dev/categories/</link><description>Recent content in Categories on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><atom:link href="https://avimallu.dev/categories/index.xml" rel="self" type="application/rss+xml"/></channel></rss>

BIN
public/favicon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.2 KiB

1
public/herman.min.css vendored Normal file
View File

@@ -0,0 +1 @@
:root{font-size:62.5%;--color-dark:#181a20;--color-light:#fafafa;--color-primary:#1a8fe3;--size:1rem;--spacing:calc(var(--size) * 2.4)}body{background:var(--color-dark);color:var(--color-light);padding:4rem;font-family:Avenir,avenir next lt pro,Montserrat,Corbel,urw gothic,source-sans-pro,sans-serif,apple color emoji,segoe ui emoji,segoe ui symbol,noto color emoji;font-size:calc(var(--size) * 1.8);line-height:1.5;min-height:80vh;max-width:1600px;margin:0 auto;word-wrap:break-word}header,main,footer{max-width:70ch;margin-inline:auto}header{padding-bottom:var(--spacing)}nav a,a.blog-tags{margin-right:calc(var(--spacing)/2)}a.blog-tags{line-height:2}main{padding-bottom:var(--spacing)}footer{text-align:center;padding-top:var(--spacing)}a{color:currentColor;text-decoration-color:var(--color-primary);text-decoration-thickness:.3ex;text-underline-offset:.3ex}a:hover{text-decoration-thickness:.4ex}img{display:block;max-width:100%;height:auto}h1,h2,h3,h4{font-weight:700;line-height:1.3}h1{font-size:calc(var(--size) * 4.2)}h2{font-size:calc(var(--size) * 3.4)}h3{font-size:calc(var(--size) * 2.6)}h4{font-size:calc(var(--size) * 1.8)}ul,ol{padding-inline-start:var(--spacing)}li{margin-block-start:var(--spacing)}blockquote{padding-inline-start:var(--spacing);border-inline-start:.2em solid;font-style:italic;max-width:50ch}:is(h1,h2,h3,h4,blockquote){margin-block-end:calc(var(--spacing)/2)}:is(h1,h2,h3,h4)+*{margin-block-start:calc(var(--spacing)/3)}:is(h1,h2,h3,h4)+:where(h2,h3,h4){margin-block-start:calc(var(--spacing) * 2)}.title{text-decoration:none}.title h1{font-size:calc(var(--size) * 3.4);margin-top:calc(var(--spacing)/2)}ul.blog-posts{list-style-type:none;padding:unset}ul.blog-posts li{display:flex;flex-direction:column}ul.blog-posts li span{min-width:11ch}p.byline{opacity:.5}code{font-family:ui-monospace,cascadia code,source code pro,Menlo,Consolas,dejavu sans mono,monospace;padding:2px calc(var(--spacing)/4);background-color:#282a36;font-size:calc(var(--size) * 
1.4)}pre code{display:block;padding:var(--spacing);overflow-x:auto;-webkit-text-size-adjust:100%;-moz-text-size-adjust:100%}table{width:100%}table,th,td{border:1px solid;border-collapse:collapse;border-color:var(--color-light);padding:calc(var(--spacing)/2)}.disabled{color:currentColor;cursor:not-allowed;opacity:.5}@media screen and (min-width:600px){ul.blog-posts li{flex-direction:row;gap:calc(var(--spacing)/2)}}.skip-link{position:absolute;top:5;transform:translateY(-600%);transition:transform .5s;background-color:#181a20;padding:6px}.skip-link:focus{transform:translateY(0%)}figure{margin-inline-start:0;margin-inline-end:0}figcaption>p{margin-block-start:9px;text-align:center;font-style:italic}

11
public/index.html Normal file
View File

@@ -0,0 +1,11 @@
<!doctype html><html lang=en-US><head><meta name=generator content="Hugo 0.142.0"><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>about | Avinash's Blog</title><meta name=title content="about"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="about"><meta property="og:description" content="Hi there! My name is Avinash Mallya (pronounced Uh-vin-aash Muh-ll-yeah), and I&rsquo;m a data scientist by profession. This website is a creative outlet, and my piece of the internet where I show off.
What&rsquo;s here? You&rsquo;ll find the following:
A few posts where I show up some creative ways that I&rsquo;ve solved complex problems. Links to projects that I&rsquo;ve worked on, or have contributed to. An assortment of random things I&rsquo;ve found interesting. Contact You can find me on:"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="about"><meta name=twitter:description content="Hi there! My name is Avinash Mallya (pronounced Uh-vin-aash Muh-ll-yeah), and I&rsquo;m a data scientist by profession. This website is a creative outlet, and my piece of the internet where I show off.
What&rsquo;s here? You&rsquo;ll find the following:
A few posts where I show up some creative ways that I&rsquo;ve solved complex problems. Links to projects that I&rsquo;ve worked on, or have contributed to. An assortment of random things I&rsquo;ve found interesting. Contact You can find me on:"><meta itemprop=name content="about"><meta itemprop=description content="Hi there! My name is Avinash Mallya (pronounced Uh-vin-aash Muh-ll-yeah), and I&rsquo;m a data scientist by profession. This website is a creative outlet, and my piece of the internet where I show off.
What&rsquo;s here? You&rsquo;ll find the following:
A few posts where I show up some creative ways that I&rsquo;ve solved complex problems. Links to projects that I&rsquo;ve worked on, or have contributed to. An assortment of random things I&rsquo;ve found interesting. Contact You can find me on:"><meta itemprop=datePublished content="2023-10-20T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-20T00:00:00+00:00"><meta itemprop=wordCount content="94"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><h1 id=hi-there>Hi there!</h1><p>My name is Avinash Mallya (pronounced Uh-vin-aash Muh-ll-yeah), and I&rsquo;m a data scientist by profession. This website is a creative outlet, and my piece of the internet where I show off.</p><h1 id=whats-here>What&rsquo;s here?</h1><p>You&rsquo;ll find the following:</p><ul><li>A few posts where I show up some creative ways that I&rsquo;ve solved complex problems.</li><li>Links to projects that I&rsquo;ve worked on, or have contributed to.</li><li>An assortment of random things I&rsquo;ve found interesting.</li></ul><h1 id=contact>Contact</h1><p>You can find me on:</p><ul><li><a href=https://www.linkedin.com/in/avinash-mallya>LinkedIn</a></li><li><a href=https://github.com/avimallu>Github</a></li></ul><p>Please reach out via one of the above if you want to talk.</p></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

950
public/index.xml Normal file
View File

@@ -0,0 +1,950 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Avinash's Blog</title><link>https://avimallu.dev/</link><description>Recent content on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Fri, 20 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/index.xml" rel="self" type="application/rss+xml"/><item><title>projects</title><link>https://avimallu.dev/projects/</link><pubDate>Mon, 01 Jan 0001 00:00:00 +0000</pubDate><guid>https://avimallu.dev/projects/</guid><description>&lt;p>Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.&lt;/p>
&lt;h1 id="featured-projects">Featured projects&lt;/h1>
&lt;ol>
&lt;li>&lt;a href="https://avimallu.github.io/BorrowChecker/">BorrowChecker&lt;/a>: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. &lt;a href="https://github.com/avimallu/BorrowChecker">Repository link&lt;/a>.&lt;/li>
&lt;li>&lt;a href="https://github.com/avimallu/PowerPointSnap">PowerPointSnap&lt;/a>: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying &lt;a href="https://avimallu.dev/blog/003_powerpointsnap/">blog post&lt;/a>.&lt;/li>
&lt;/ol>
&lt;h1 id="other-work-or-contributions">Other work or contributions&lt;/h1>
&lt;ol>
&lt;li>&lt;a href="https://github.com/avimallu/IntelligentReceiptSplitter">IntelligentReceiptSplitter&lt;/a>: A relatively simple predecessor to &lt;a href="https://avimallu.github.io/BorrowChecker/">BorrowChecker&lt;/a> that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run.&lt;/li>
&lt;li>&lt;a href="https://github.com/avimallu/r.data.table.funs">r.data.table.funs&lt;/a>: A very small set of R functions that use &lt;code>data.table&lt;/code>, that I found very useful earlier in my career to quickly churn out analyses. It is not ground-breaking, but rather something that anybody with sufficient basic skills in R can understand, and that can save an immense amount of time.&lt;/li>
&lt;li>I &lt;a href="https://github.com/pola-rs/polars-book/pull/364">wrote&lt;/a> &lt;a href="https://github.com/pola-rs/polars-book/pull/358">several&lt;/a> &lt;a href="https://github.com/pola-rs/polars-book/pull/365/files">chapters&lt;/a> of the Polars Book, which have since been moved to the main Polars repository. Polars was a breath of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like &lt;code>data.table&lt;/code> and &lt;code>dplyr&lt;/code> dominated), so I was eager to make it better for everybody making the switch.&lt;/li>
&lt;/ol></description><content:encoded><![CDATA[<p>Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.</p>
<h1 id="featured-projects">Featured projects</h1>
<ol>
<li><a href="https://avimallu.github.io/BorrowChecker/">BorrowChecker</a>: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. <a href="https://github.com/avimallu/BorrowChecker">Repository link</a>.</li>
<li><a href="https://github.com/avimallu/PowerPointSnap">PowerPointSnap</a>: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying <a href="https://avimallu.dev/blog/003_powerpointsnap/">blog post</a>.</li>
</ol>
<h1 id="other-work-or-contributions">Other work or contributions</h1>
<ol>
<li><a href="https://github.com/avimallu/IntelligentReceiptSplitter">IntelligentReceiptSplitter</a>: A relatively simple predecessor to <a href="https://avimallu.github.io/BorrowChecker/">BorrowChecker</a> that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run.</li>
<li><a href="https://github.com/avimallu/r.data.table.funs">r.data.table.funs</a>: A very small set of R functions that use <code>data.table</code>, that I found very useful earlier in my career to quickly churn out analyses. It is not ground-breaking, but rather something that anybody with sufficient basic skills in R can understand, and that can save an immense amount of time.</li>
<li>I <a href="https://github.com/pola-rs/polars-book/pull/364">wrote</a> <a href="https://github.com/pola-rs/polars-book/pull/358">several</a> <a href="https://github.com/pola-rs/polars-book/pull/365/files">chapters</a> of the Polars Book, which have since been moved to the main Polars repository. Polars was a breath of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like <code>data.table</code> and <code>dplyr</code> dominated), so I was eager to make it better for everybody making the switch.</li>
</ol>
]]></content:encoded></item><item><title>Quick hacks to make client-ready presentations</title><link>https://avimallu.dev/blog/003_powerpointsnap/</link><pubDate>Fri, 20 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/003_powerpointsnap/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (&lt;em>decks&lt;/em> in consulting lingo - not even &lt;em>slide decks&lt;/em>). However, it was rather repetitive. Thus, was born PowerPointSnap.&lt;/p>
&lt;h1 id="what-is-it">What is it?&lt;/h1>
&lt;p>I&amp;rsquo;ll write this down as pointers.&lt;/p>
&lt;ol>
&lt;li>It&amp;rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.&lt;/li>
&lt;li>It&amp;rsquo;s Windows only - it&amp;rsquo;s unlikely to work on MacOS.&lt;/li>
&lt;li>It&amp;rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.&lt;/li>
&lt;/ol>
&lt;h1 id="how-do-i-get-it">How do I get it?&lt;/h1>
&lt;p>The project is available on this &lt;a href="https://github.com/avimallu/PowerPointSnap">GitHub repo&lt;/a>. The instructions to install it are available there, but here&amp;rsquo;s the lowdown:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (<em>decks</em> in consulting lingo - not even <em>slide decks</em>). However, it was rather repetitive. Thus, was born PowerPointSnap.</p>
<h1 id="what-is-it">What is it?</h1>
<p>I&rsquo;ll write this down as pointers.</p>
<ol>
<li>It&rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.</li>
<li>It&rsquo;s Windows only - it&rsquo;s unlikely to work on MacOS.</li>
<li>It&rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.</li>
</ol>
<h1 id="how-do-i-get-it">How do I get it?</h1>
<p>The project is available on this <a href="https://github.com/avimallu/PowerPointSnap">GitHub repo</a>. The instructions to install it are available there, but here&rsquo;s the lowdown:</p>
<ol>
<li>Download the Snap.ppam file to your system.</li>
<li>Enable the developer options.</li>
<li>Go to the Developer tab, and click on PowerPoint Add-ins.</li>
<li>Click on Add New. Choose the location of the file you just downloaded. Click Close.</li>
<li>To uninstall, repeat the process, and simply click on Remove this time.</li>
</ol>
<h1 id="what-can-i-do-with-it">What can I do with it?</h1>
<p>Frankly, a LOT. The base concept of this tool is:</p>
<ol>
<li>&ldquo;Set&rdquo; a shape as the one you want to copy a property from.</li>
<li>Select any property from the list to automatically apply it.</li>
</ol>
<p>Here&rsquo;s a non-exhaustive list of all the options available.</p>
<h2 id="apply-properties-of-shapes-directly">Apply properties of shapes directly</h2>
<p>This is the part of the interface that can be used for shapes (which include charts and tables).</p>
<p><img src="/blog/003_powerpointsnap/01_Shapes.png" alt="The UI for copying shape properties"></p>
<p>To use, first select a <em>shape</em> object, click on &ldquo;Set&rdquo;. Then, choose the object you want to <em>Snap</em> its properties to (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.</p>
<p>Note that it&rsquo;s probably not a good idea to apply a property of a shape to a table - if you want to make the entire table orange, there are probably better built-in ways to do it than to use <em>Snap</em>.</p>
<h2 id="beautify-charts-with-snappable-properties">Beautify charts with <em>Snap</em>pable properties</h2>
<p>Charts are also supported, with dedicated features for it.</p>
<p><img src="/blog/003_powerpointsnap/02_Charts.png" alt="The UI for copying chart properties"></p>
<p>What do these features do? You should be able to hover over the option and get a tooltip that shows what it&rsquo;s capable of, but here&rsquo;s another summary just in case:</p>
<ol>
<li>Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the &ldquo;set&rdquo; chart to the one you&rsquo;ve selected. I couldn&rsquo;t put in just $x$ and $y$ here because Microsoft internally doesn&rsquo;t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn&rsquo;t work well yet for 3D charts.</li>
<li>Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all <em>look</em> exactly the same from a client perspective. But that&rsquo;s usually difficult if you&rsquo;ve already configured the charts a little - which can be remedied with this option!</li>
<li>Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you&rsquo;ve selected with the way it originally is in the &ldquo;set&rdquo; chart. The reason for this feature is simply to avoid going back to <em>Home</em> to click on the <em>Format Painter</em> option again.</li>
<li>Reset Axes Scales: in case you messed up somewhere, you can use this to revert to PowerPoint defaults.</li>
</ol>
<p>The next two options deserve their own section.</p>
<h2 id="customize-the-labels-programmatically">Customize the labels programmatically</h2>
<p>Your immediate senior in a consulting environment would frown at your chart, and then exclaim, &ldquo;I think that&rsquo;s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it&rsquo;s a one time thing!&rdquo;</p>
<p>It&rsquo;s <strong>never</strong> a one time affair. But don&rsquo;t worry, we have this nice feature to help us. If you click on the <em>Customize Label</em> option, you will get this (without the &ldquo;Set&rdquo; option):</p>
<p><img src="/blog/003_powerpointsnap/DataLabelsScreenshot.JPG" alt="The UI for customizing labels."></p>
<p>Never mind the rather unfriendly legend entries. They&rsquo;re just here to demonstrate that you can do the following kinds of whacky abilities with your own chart!</p>
<h3 id="screenshots-of-the-chart-snapability">Screenshots of the chart <em>snap</em>ability</h3>
<p>Of course, visuals will do it more justice. For example, look at this image:</p>
<p><img src="/blog/003_powerpointsnap/Revenue_Presentation_1.png" alt="There's a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles aren't centered."></p>
<p>Here&rsquo;s what you can do:</p>
<ol>
<li>Click on the left chart. Press &ldquo;Set&rdquo; in the toolbar for <em>Snap</em>.</li>
<li>Click on the right chart, and then go through the following:
<ol>
<li>In <em>Shapes</em>, click on <em>Dim</em>. This will align the shapes of the chart.</li>
<li>Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.</li>
<li>You&rsquo;ll notice that the chart area doesn&rsquo;t still match, nor does the title.</li>
<li>In <em>Charts</em>, click on <em>Sync Plot Area</em> and <em>Sync Title Area</em>, and watch the magic unfold.</li>
<li>Now, click on the second chart, and click on &ldquo;Set&rdquo;. Let&rsquo;s align the axes of the first chart to the second one.</li>
<li>Click on the first chart, and then in <em>Charts</em>, click <em>Sync Value Axis</em>.</li>
</ol>
</li>
<li>Let&rsquo;s bring that senior&rsquo;s exclamation back into play - (s)he wants you to highlight <em>only</em> Profit labels, and that too every 2 iterations. To do this:
<ol>
<li>Click on <em>Customize Labels</em> after clicking on either chart.</li>
<li>You&rsquo;ll get the screen shown in the previous section. Make sure to adjust the values such that it&rsquo;s exactly like the screenshot there.</li>
<li>Click on &ldquo;Save and Run&rdquo;. This will <em>save</em> the configuration you&rsquo;ve selected, and <em>run</em> it on the chart you&rsquo;ve selected.</li>
<li>Click the other chart. Then, in <em>Charts</em>, click on <em>Rerun Customization</em>.</li>
</ol>
</li>
</ol>
<p>This is what your results should look like:</p>
<p><img src="/blog/003_powerpointsnap/Revenue_Presentation_2.png" alt="Everything is almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it quickly… maybe they should change some requirements…"></p>
<p>Of course, getting those calculations right is a whole different thing that will need some work.</p>
<h2 id="align-table-dimensions">Align table dimensions</h2>
<p>Oftentimes, you have two tables that show similar values&hellip; you know the drill. Here&rsquo;s what you can do in a scenario such as this:</p>
<p><img src="/blog/003_powerpointsnap/Table_Presentation_1.png" alt="Similar data, but vastly different tables."></p>
<p>This is what the <em>Tables</em> section of the tool looks like:</p>
<p><img src="/blog/003_powerpointsnap/03_Tables.png" alt="The UI for Tables"></p>
<p>To align these tables together,</p>
<ol>
<li>Click on the left table. Press &ldquo;Set&rdquo; in the toolbar for <em>Snap</em>.</li>
<li>Click on the right table.</li>
<li>Click on <em>Shapes</em>, inside it, <em>Dim</em>. Now the shapes of the table are the same.</li>
<li>In <em>Tables</em>, click on <em>Sync Column Widths</em>. Now the columns are also the same.</li>
<li>If you try to align by rows, it fails because the number of rows is not the same in the two tables.</li>
</ol>
<p>Here&rsquo;s what you&rsquo;ll end up with:</p>
<p><img src="/blog/003_powerpointsnap/Table_Presentation_2.png" alt="Similar data, and similar enough tables."></p>
<p>Pretty neat, eh?</p>
]]></content:encoded></item><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> prefixing the command used, while the output appears without that prefix.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? A small sample makes the model faster to train - <em>and</em> keeps the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium-scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that ANN algorithms take shortcuts to give you, if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Eucledian Matrix</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that selects only the data belonging to a particular category, and then creates the database from it. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Eucledian or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em> i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B (that is, they meet a particular <em>minimum threshold</em>), then A will be connected to C through B! Hopefully the small visual below helps.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended for many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very Starry Night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate it for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map FAISS result positions back to the original `row_idx` values in `data`</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item><item><title>Overlap Joins: Number of docked trucks in an interval</title><link>https://avimallu.dev/blog/001_overlap_joins/</link><pubDate>Thu, 22 Jun 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/001_overlap_joins/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>I stumbled upon an interesting &lt;a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stack Overflow question&lt;/a> that was linked &lt;a href="https://github.com/pola-rs/polars/issues/9467">via an issue&lt;/a> on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question, Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.&lt;/p>
&lt;p>I&amp;rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.&lt;/p>
&lt;h1 id="problem-statement">Problem Statement&lt;/h1>
&lt;p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&amp;rsquo;s ID.&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>I stumbled upon an interesting <a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stack Overflow question</a> that was linked <a href="https://github.com/pola-rs/polars/issues/9467">via an issue</a> on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question, Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.</p>
<p>I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.</p>
<h1 id="problem-statement">Problem Statement</h1>
<p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span> <span class="c1"># if you don&#39;t have polars, run </span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="c1"># pip install &#39;polars[all]&#39;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">from_repr</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1 │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1 │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5 │
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3 │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3 │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6 │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5 │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6 │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div><p>We want to identify the number of trucks docked at any given time within a threshold of 1 minute <em>prior</em> to the arrival time of a truck, and 1 minute <em>after</em> the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.</p>
<h1 id="finding-a-solution-to-the-problem">Finding a solution to the problem</h1>
<h2 id="evaluate-for-a-specific-row">Evaluate for a specific row</h2>
<p>Before we find a general solution to this problem, let&rsquo;s consider a specific row to understand the problem better:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span></span></span></code></pre></div><p>For this row, we need to find the number of trucks that are there between <code>2023-01-01 06:31:06</code> (1 minute prior to the <code>arrival_time</code>) and <code>2023-01-01 06:34:48</code> (1 minute post the <code>departure_time</code>). Manually going through the original dataset, we see that <code>B3</code>, <code>C3</code>, <code>A6</code> and <code>A5</code> are the truck IDs that qualify - they are all at the station at some point between <code>2023-01-01 06:31:06</code> and <code>2023-01-01 06:34:48</code>.</p>
<h2 id="visually-deriving-an-algorithm">Visually deriving an algorithm</h2>
<p>There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap <em>window</em> relative to the arrival and departure times):</p>
<p><img src="/blog/001_overlap_joins/overlap_algorithm.png" alt="The five different ways a period can overlap."></p>
<p>Take some time to absorb these cases - it&rsquo;s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.</p>
<h2 id="writing-an-sql-query-based-on-the-algorithm">Writing an SQL query based on the algorithm</h2>
<p>In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It&rsquo;s often able to convey elegantly the logic that was used to execute the algorithm; and while it does come with excessive verbosity at times, it doesn&rsquo;t quite in this case.</p>
<p>Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).</p>
<h3 id="introducing-the-duckdb-package">Introducing the DuckDB package</h3>
<p>Once again, in theory, any SQL package or language can be used. Far too few, however, match the ease of use that <a href="https://duckdb.org/">DuckDB</a> provides:</p>
<ol>
<li>no expensive set-up time (meaning no need for setting up databases, even temporary ones),</li>
<li>no dependencies (other than DuckDB itself, just <code>pip install duckdb</code>),</li>
<li>some very <a href="https://duckdb.org/2022/05/04/friendlier-sql.html">friendly SQL extensions</a>, and</li>
<li>ability to work directly on Polars and Pandas DataFrames without conversions</li>
</ol>
<p>all with <a href="https://duckdblabs.github.io/db-benchmark/">mind-blowing speed</a> that stands shoulder-to-shoulder with Polars. We&rsquo;ll also use a few advanced SQL concepts noted below.</p>
<h4 id="self-joins">Self-joins</h4>
<p>This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.</p>
<h4 id="a-bullet-train-recap-of-non-equi-joins">A bullet train recap of non-equi joins</h4>
<p>A key concept that we&rsquo;ll use is the idea of joining on a <em>range</em> of values rather than a specific value. That is, instead of the usual <code>LEFT JOIN ON A.column = B.column</code>, we can do <code>LEFT JOIN ON A.column &lt;= B.column</code> for one row in table <code>A</code> to match to multiple rows in <code>B</code>. DuckDB has a <a href="https://duckdb.org/2022/05/27/iejoin.html">blog post</a> that outlines this join in detail, including a fast implementation.</p>
<h4 id="the-concept-of-list-columns">The concept of <code>LIST</code> columns</h4>
<p>DuckDB has first-class support for <code>LIST</code> columns - that is, each row in a <code>LIST</code> column can have a varying length (much like a Python <code>list</code>), but must have the exact same datatype (like R&rsquo;s <code>vector</code>). Using list columns allows us to eschew the use of an additional <code>GROUP BY</code> operation on top of a <code>WHERE</code> filter or <code>SELECT DISTINCT</code> operation, since we can directly perform those on the <code>LIST</code> column itself.</p>
<h4 id="date-algebra">Date algebra</h4>
<p>Dates can be rather difficult to handle well in most tools and languages, with several packages purpose-built to make handling them easier - <a href="https://lubridate.tidyverse.org/">lubridate</a> from the <a href="https://www.tidyverse.org/">tidyverse</a> is a stellar example. Thankfully, DuckDB provides a similar Swiss-army-knife set of tools to deal with them, including specifying <code>INTERVAL</code>s (a special data type that represents a period of time independent of specific time values) to modify <code>TIMESTAMP</code> values using addition or subtraction.</p>
<h3 id="tell-me-the-query-please">Tell me the query, PLEASE!</h3>
<p>Okay - that was a lot of background. Let&rsquo;s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">DATEDIFF</span><span class="p">(</span><span class="s1">&#39;seconds&#39;</span><span class="p">,</span><span class="w"> </span><span class="n">arrival_time</span><span class="p">,</span><span class="w"> </span><span class="n">departure_time</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">duration</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">((</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">))</span><span class="w">
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>A small, succinct query such as this will need a bit of explanation to take it all in. Here&rsquo;s one below, reproducible in Python (make sure to install <code>duckdb</code> first!). Expand it to view.</p>
<details markdown="1"><summary>SQL with explanation.</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> ,A.window_open
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.window_close
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> -- LIST aggregates the values into a LIST column
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> -- and LIST_DISTINCT finds the unique values in it
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> -- finally, LIST_UNIQUE calculates the unique number of values in it
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2"> FROM (
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2"> ,arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2"> ,departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2"> FROM data -- remember we defined data as the Polars DataFrame with our truck station data
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2"> ) A
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2"> LEFT JOIN (
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2"> -- This is the time, in seconds between the arrival and departure of
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2"> -- each truck PER ROW in the original data-frame
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2"> ,DATEDIFF(&#39;seconds&#39;, arrival_time, departure_time) AS duration
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2"> FROM data -- this is where we perform a self-join
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2"> ) B
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2"> ON (
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2"> -- Case 2 in the diagram;
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2"> (B.arrival_time &lt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2"> -- Adding the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2"> -- is at least ENDING AFTER the start of the overlap window
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2"> (B.arrival_time + TO_SECONDS(B.duration)) &gt;= A.window_open) OR
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="s2"> -- Case 3 in the diagram - the simplest of all five cases
</span></span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="s2"> B.departure_time &lt;= A.window_close) OR
</span></span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="s2"> -- Case 4 in the digram;
</span></span></span><span class="line"><span class="ln">43</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">44</span><span class="cl"><span class="s2"> -- Subtracting the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="s2"> -- STARTS BEFORE the end of the overlap window.
</span></span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="s2"> (B.departure_time - TO_SECONDS(B.duration)) &lt;= A.window_close)
</span></span></span><span class="line"><span class="ln">47</span><span class="cl"><span class="s2"> )
</span></span></span><span class="line"><span class="ln">48</span><span class="cl"><span class="s2"> GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">49</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div></details>
<p>The output of this query is:</p>
<pre tabindex="0"><code>&#34;&#34;&#34;
┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
│ arrival_time │ departure_time │ window_open │ … │ docked_trucks │ docked_truck_count │
│ timestamp │ timestamp │ timestamp │ │ varchar[] │ uint64 │
├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1] │ 1 │
│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1] │ 1 │
│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │ 4 │
│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3] │ 3 │
│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3] │ 3 │
├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
│ 9 rows 6 columns (5 shown) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
&#34;&#34;&#34;</code></pre><p>We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also see how DuckDB is able to seamlessly integrate with an existing Pandas or Polars pipeline with zero conversion costs. In fact, we can convert this back to a Polars or Pandas dataframe by calling a conversion method right after the closing bracket: <code>db.query(...).pl()</code> and <code>db.query(...).pd()</code> respectively.</p>
<h2 id="can-we-make-the-sql-simpler">Can we make the SQL simpler?</h2>
<p>Now that we&rsquo;ve understood the logic that goes into the query, let&rsquo;s try to optimize the algorithm. We have the three conditions:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="c1">-- Case 2 in the diagram
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="w"></span><span class="c1">-- Case 3 in the diagram
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="w"></span><span class="c1">-- Case 4 in the diagram
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span></span></span></code></pre></div><p>What is common between these three conditions? It takes a while to see it; but it becomes clear that all these cases require the start of the overlap to be <em>before</em> the window ends, and the end of the overlap to be <em>after</em> the window starts. This can be simplified to just:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="w"></span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span></span></span></code></pre></div><p>making our query much simpler!</p>
<h3 id="simplified-sql-part-1">Simplified SQL: Part 1</h3>
<p>We&rsquo;ve removed the need for the <code>duration</code> calculation altogether now. Therefore, we can write:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="p">)</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Can we simplify this even further?</p>
<h3 id="simplification-part-2">Simplification: Part 2</h3>
<p>I think the SQL query in the above section is very easy to read already. However, it is a little clunky overall, and there is a way that we can leverage DuckDB&rsquo;s extensive optimizations to improve <strong>legibility</strong> by rewriting the query as a cross join:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">A</span><span class="p">,</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">WHERE</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"></span><span class="k">AND</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Why does this work? Before optimization on DuckDB, this is what the query plan looks like:</p>
<details markdown="1"><summary>DuckDB query plan before optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ FILTER │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ (arrival_time &lt;= │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│(departure_time + to_m... │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ (departure_time &gt;= │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│(arrival_time - to_min... │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">│ CROSS_PRODUCT ├──────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>After optimization, the <code>CROSS_PRODUCT</code> is <strong>automatically</strong> optimized to an <strong>interval join</strong>!</p>
<details markdown="1"><summary>DuckDB query plan after optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ COMPARISON_JOIN │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ INNER │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│ ((departure_time + &#39;00:01 │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ :00&#39;::INTERVAL) &gt;= ├──────────────┐
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ arrival_time) │ │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│((arrival_time - &#39;00:01:00&#39;│ │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ ::INTERVAL) &lt;= │ │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">│ departure_time) │ │
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>So in effect, we&rsquo;re actually exploiting a feature of DuckDB to allow us to write our queries in a suboptimal manner for greater readability, and allowing the optimizer to do a good chunk of our work for us. I wouldn&rsquo;t recommend using this generally, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.</p>
<h3 id="how-to-get-query-plans">How to get query plans?</h3>
<p>I&rsquo;m glad you asked. Here&rsquo;s the DuckDB <a href="https://duckdb.org/docs/guides/meta/explain.html">page explaining <code>EXPLAIN</code></a> (heh). Here&rsquo;s the code I used:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&#34;SET EXPLAIN_OUTPUT=&#39;all&#39;;&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="nb">print</span><span class="p">(</span><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">EXPLAIN
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">SELECT
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">FROM data A, data B
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">WHERE B.arrival_time &lt;= window_close
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">AND B.departure_time &gt;= window_open
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">pl</span><span class="p">()[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span></span></span></code></pre></div><h1 id="what-are-the-alternatives">What are the alternatives?</h1>
<h2 id="the-datatable-way">The <code>data.table</code> way</h2>
<p><a href="https://github.com/Rdatatable/data.table"><code>data.table</code></a> is a package that has historically been ahead of its time - in both speed and features. Development has taken a hit recently, but will likely <a href="https://github.com/Rdatatable/data.table/issues/5656">pick back up</a>. It&rsquo;s my favourite package on all fronts for data manipulation, but suffers simply from the lack of broader R support across the ML and DL space.</p>
<h3 id="the-foverlaps-function">The <code>foverlaps</code> function</h3>
<p>If this kind of overlapping join is common, shouldn&rsquo;t someone have developed a package for it? Turns out, <code>data.table</code> has, and with very specific constraints that make it the perfect solution to our problem (if you don&rsquo;t mind switching over to R, that is).</p>
<p>The <code>foverlaps</code> function has these requirements:</p>
<ol>
<li>The input <code>data.table</code> objects have to be keyed for automatic recognition of columns.</li>
<li>The default match type matches all three cases from the image above. Side note: it also has match types for <code>within</code> overlap, and for matching <code>start</code> and <code>end</code> windows.</li>
<li>The last two matching columns in the join condition in <code>by</code> must specify the <code>start</code> and <code>end</code> points of the overlapping window. This isn&rsquo;t a problem for us now, but it does restrict future uses where we may want non-equi joins in other cases.</li>
</ol>
<h3 id="the-code-_si_-the-code">The code, <em>si</em>, the code!</h3>
<p>Without further ado:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">data.table</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">lubridate</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl">
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="n">arrival_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s">&#39;2023-01-01 06:23:47.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:26:42.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s">&#39;2023-01-01 06:30:20.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:32:06.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s">&#39;2023-01-01 06:33:09.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:34:08.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:40.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:37:43.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s">&#39;2023-01-01 06:39:48.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="n">departure_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s">&#39;2023-01-01 06:25:08.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:28:02.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s">&#39;2023-01-01 06:35:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:33:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:39:49.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s">&#39;2023-01-01 06:38:34.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:40:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s">&#39;2023-01-01 06:46:10.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="n">ID</span> <span class="o">=</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;B3&#39;</span><span class="p">,</span> <span class="s">&#39;C3&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl">
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">DT</span> <span class="o">=</span> <span class="nf">data.table</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">arrival_time</span> <span class="o">=</span> <span class="n">arrival_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">departure_time</span> <span class="o">=</span> <span class="n">departure_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="n">ID</span> <span class="o">=</span> <span class="n">ID</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl">
</span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="c1"># A copy(DT) creates a copy of a data.table that isn&#39;t linked</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="c1"># to the original one, so that changes in it don&#39;t reflect in</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="c1"># the original DT object.</span>
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># The `:=` allows assignment by reference (i.e. &#34;in place&#34;).</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="n">DT_with_windows</span> <span class="o">=</span> <span class="nf">copy</span><span class="p">(</span><span class="n">DT</span><span class="p">)</span><span class="n">[</span><span class="p">,</span> <span class="nf">`:=`</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">window_start</span> <span class="o">=</span> <span class="n">arrival_time</span> <span class="o">-</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">window_end</span> <span class="o">=</span> <span class="n">departure_time</span> <span class="o">+</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">))</span><span class="n">]</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl">
</span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="c1"># This step is necessary for the second table, but not the first, but we</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="c1"># key both data.tables to make the foverlaps code very succinct.</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;arrival_time&#34;</span><span class="p">,</span> <span class="s">&#34;departure_time&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT_with_windows</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;window_start&#34;</span><span class="p">,</span> <span class="s">&#34;window_end&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">37</span><span class="cl">
</span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="c1"># The foverlaps function returns a data.table, so we can simply apply</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="c1"># the usual data.table syntax on it!</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="c1"># Since we have the same name of some columns in both data.tables,</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="c1"># the latter table&#39;s columns are prefixed with &#34;i.&#34; to avoid conflicts.</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="nf">foverlaps</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="n">DT_with_windows</span><span class="p">)</span><span class="n">[</span>
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="p">,</span> <span class="n">.(docked_trucks</span> <span class="o">=</span> <span class="nf">list</span><span class="p">(</span><span class="nf">unique</span><span class="p">(</span><span class="n">i.ID</span><span class="p">)),</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl"> <span class="n">docked_truck_count</span> <span class="o">=</span> <span class="nf">uniqueN</span><span class="p">(</span><span class="n">i.ID</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">45</span><span class="cl"> <span class="p">,</span> <span class="n">.(arrival_time</span><span class="p">,</span> <span class="n">departure_time</span><span class="p">)</span><span class="n">]</span></span></span></code></pre></div><p>provides us the output:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"> <span class="n">arrival_time</span> <span class="n">departure_time</span> <span class="n">docked_trucks</span> <span class="n">docked_truck_count</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">list</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">int</span><span class="o">&gt;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="m">1</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">23</span><span class="o">:</span><span class="m">47</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">25</span><span class="o">:</span><span class="m">08</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="m">2</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">26</span><span class="o">:</span><span class="m">42</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">28</span><span class="o">:</span><span class="m">02</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="m">3</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">30</span><span class="o">:</span><span class="m">20</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">35</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="m">4</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">32</span><span class="o">:</span><span class="m">06</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">48</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="m">5</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">09</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="m">6</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">34</span><span class="o">:</span><span class="m">08</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">49</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="m">7</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">40</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">38</span><span class="o">:</span><span class="m">34</span> <span class="n">B3</span><span class="p">,</span><span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="m">8</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">37</span><span class="o">:</span><span class="m">43</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">40</span><span class="o">:</span><span class="m">48</span> <span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">3</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="m">9</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">48</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">46</span><span class="o">:</span><span class="m">10</span> <span class="n">C3</span><span class="p">,</span><span class="n">A5</span><span class="p">,</span><span class="n">A6</span> <span class="m">3</span></span></span></code></pre></div><h3 id="considerations-for-using-datatable">Considerations for using <code>data.table</code></h3>
<p>The package offers a wonderful, nearly one-stop solution that doesn&rsquo;t require you to write the logic out for the query or command yourself, but has a major problem for a lot of users - it requires you to switch your codebase to R, and a lot of your tasks may be in Python or in an SQL pipeline. So, what do you do?</p>
<p>Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you&rsquo;ll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.</p>
]]></content:encoded></item></channel></rss>

1
public/original.min.css vendored Normal file
View File

@@ -0,0 +1 @@
code{text-size-adjust:100%;-ms-text-size-adjust:100%;-moz-text-size-adjust:100%;-webkit-text-size-adjust:100%}body{font-family:Verdana,sans-serif;margin:auto;padding:20px;max-width:720px;text-align:left;background-color:#1d1f27;word-wrap:break-word;overflow-wrap:break-word;line-height:1.5;color:#c9d1d9}h1,h2,h3,h4,h5,h6,strong,b{color:#eee}a{color:#8cc2dd}.title{text-decoration:none;border:0}.title h1{font-size:24px;margin:19.92px 0}.title span{font-weight:400}nav a{margin-right:10px}textarea{background-color:#252525;color:#ddd;width:100%;font-size:16px}input{background-color:#252525;color:#ddd;font-size:16px}content{line-height:1.6}table{width:100%}table,th,td{border:1px solid;border-collapse:collapse;border-color:#c9d1d9;padding:5px}img{max-width:100%;height:auto}code{padding:2px 5px;color:#f8f8f2;background-color:#282a36}pre code{display:block;padding:20px;white-space:pre-wrap;font-size:14px;overflow-x:auto;text-wrap:nowrap}blockquote{border-left:1px solid #999;color:#ccc;padding-left:20px;font-style:italic}footer{padding:25px;text-align:center}.helptext{color:#aaa;font-size:small}.errorlist{color:#eba613;font-size:small}ul.blog-posts{list-style-type:none;padding:unset}ul.blog-posts li{display:flex;margin-bottom:10px}ul.blog-posts li span{flex:0 0 130px}ul.blog-posts li a:visited{color:#8b6fcb}a.blog-tags{line-height:2;margin-right:12px}h3.blog-filter{margin-bottom:0}.disabled{color:currentColor;cursor:not-allowed;opacity:.7}p.byline{font-style:italic}.skip-link{position:absolute;top:5;transform:translateY(-600%);transition:transform .5s;background-color:#1d1f27;padding:6px}.skip-link:focus{transform:translateY(0%)}figure{margin-inline-start:0;margin-inline-end:0}figcaption>p{margin-block-start:0;text-align:center;font-style:italic;color:#ccc}

View File

@@ -0,0 +1,562 @@
<!DOCTYPE html>
<html lang="en-US">
<head><script src="/livereload.js?mindelay=10&amp;v=2&amp;port=1313&amp;path=livereload" data-no-instant defer></script>
<meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Overlap Joins | Avinash&#39;s Blog</title>
<meta name="title" content="Overlap Joins" />
<meta name="description" content="Premise
I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement
Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID." />
<meta name="author" content="" />
<meta name="keywords" content="" />
<meta property="og:url" content="http://localhost:1313/posts/001_overlap_joins/">
<meta property="og:site_name" content="Avinash&#39;s Blog">
<meta property="og:title" content="Overlap Joins">
<meta property="og:description" content="Premise I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I&#39;m more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&#39;s ID.">
<meta property="og:locale" content="en_US">
<meta property="og:type" content="article">
<meta property="article:section" content="posts">
<meta property="article:published_time" content="2023-06-22T17:27:50-04:00">
<meta property="article:modified_time" content="2023-06-22T17:27:50-04:00">
<meta property="fb:admins" content="0000000000">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Overlap Joins">
<meta name="twitter:description" content="Premise I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I&#39;m more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&#39;s ID.">
<meta name="twitter:site" content="@example">
<meta itemprop="name" content="Overlap Joins">
<meta itemprop="description" content="Premise I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I&#39;m more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&#39;s ID.">
<meta itemprop="datePublished" content="2023-06-22T17:27:50-04:00">
<meta itemprop="dateModified" content="2023-06-22T17:27:50-04:00">
<meta itemprop="wordCount" content="3078">
<meta name="referrer" content="no-referrer-when-downgrade" />
<link href="/herman.min.css" rel="stylesheet">
<link href="/syntax.min.css" rel="stylesheet">
</head>
<body>
<header><a class="skip-link" href="#main-content">Skip to main content</a>
<a href="/" class="title"><h1>Avinash&#39;s Blog</h1></a>
<nav>
<a href='http://localhost:1313/index.xml'>RSS</a>
</nav>
</header>
<main id="main-content">
<h1>Overlap Joins</h1>
<p class="byline">
<time datetime='2023-06-22' pubdate>
2023-06-22
</time>
</p>
<content>
<h1 id="premise">Premise</h1>
<p>I stumbled upon an interesting <a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stack Overflow question</a> that was linked <a href="https://github.com/pola-rs/polars/issues/9467">via an issue</a> on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question, Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.</p>
<p>I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.</p>
<h1 id="problem-statement">Problem Statement</h1>
<p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span> <span class="c1"># if you don&#39;t have polars, run </span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="c1"># pip install &#39;polars[all]&#39;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">from_repr</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1 │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1 │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5 │
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3 │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3 │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6 │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5 │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6 │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div><p>We want to identify the number of trucks docked at any given time within a threshold of 1 minute <em>prior</em> to the arrival time of a truck, and 1 minute <em>after</em> the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.</p>
<h1 id="finding-a-solution-to-the-problem">Finding a solution to the problem</h1>
<h2 id="evaluate-for-a-specific-row">Evaluate for a specific row</h2>
<p>Before we find a general solution to this problem, let&rsquo;s consider a specific row to understand the problem better:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span></span></span></code></pre></div><p>For this row, we need to find the number of trucks that are there between <code>2023-01-01 06:31:06</code> (1 minute prior to the <code>arrival_time</code>) and <code>2023-01-01 06:34:48</code> (1 minute after the <code>departure_time</code>). Manually going through the original dataset, we see that <code>B3</code>, <code>C3</code>, <code>A6</code> and <code>A5</code> are the truck IDs that qualify - they are all at the station at some point between <code>2023-01-01 06:31:06</code> and <code>2023-01-01 06:34:48</code>.</p>
<h2 id="visually-deriving-an-algorithm">Visually deriving an algorithm</h2>
<p>There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap <em>window</em> relative to the arrival and departure times):</p>
<p><img src="./assets/001_overlap_joins/overlap_algorithm.png" alt="The five different ways a period can overlap."></p>
<p>Take some time to absorb these cases - it&rsquo;s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.</p>
<h2 id="writing-an-sql-query-based-on-the-algorithm">Writing an SQL query based on the algorithm</h2>
<p>In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It&rsquo;s often able to convey elegantly the logic that was used to execute the algorithm; and while it can be excessively verbose at times, it isn&rsquo;t in this case.</p>
<p>Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).</p>
<h3 id="introducing-the-duckdb-package">Introducing the DuckDB package</h3>
<p>Once again, in theory, any SQL package or language can be used. Far too few, however, match the ease of use that <a href="https://duckdb.org/">DuckDB</a> provides:</p>
<ol>
<li>no expensive set-up time (meaning no need for setting up databases, even temporary ones),</li>
<li>no dependencies (other than DuckDB itself, just <code>pip install duckdb</code>),</li>
<li>some very <a href="https://duckdb.org/2022/05/04/friendlier-sql.html">friendly SQL extensions</a>, and</li>
<li>ability to work directly on Polars and Pandas DataFrames without conversions</li>
</ol>
<p>all with <a href="https://duckdblabs.github.io/db-benchmark/">mind-blowing speed</a> that stands shoulder-to-shoulder with Polars. We&rsquo;ll also use a few advanced SQL concepts noted below.</p>
<h4 id="self-joins">Self-joins</h4>
<p>This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.</p>
<h4 id="a-bullet-train-recap-of-non-equi-joins">A bullet train recap of non-equi joins</h4>
<p>A key concept that we&rsquo;ll use is the idea of joining on a <em>range</em> of values rather than a specific value. That is, instead of the usual <code>LEFT JOIN ON A.column = B.column</code>, we can do <code>LEFT JOIN ON A.column &lt;= B.column</code> so that one row in table <code>A</code> can match multiple rows in <code>B</code>. DuckDB has a <a href="https://duckdb.org/2022/05/27/iejoin.html">blog post</a> that outlines this join in detail, including a fast implementation.</p>
<h4 id="the-concept-of-list-columns">The concept of <code>LIST</code> columns</h4>
<p>DuckDB has first-class support for <code>LIST</code> columns - that is, each row in a <code>LIST</code> column can have a varying length (much like a Python <code>list</code>), but must have the exact same datatype (like R&rsquo;s <code>vector</code>). Using list columns allows us to eschew the use of an additional <code>GROUP BY</code> operation on top of a <code>WHERE</code> filter or <code>SELECT DISTINCT</code> operation, since we can directly perform those on the <code>LIST</code> column itself.</p>
<h4 id="date-algebra">Date algebra</h4>
<p>Dates can be rather difficult to handle well in most tools and languages, with several packages purpose-built to make handling them easier - <a href="https://lubridate.tidyverse.org/">lubridate</a> from the <a href="https://www.tidyverse.org/">tidyverse</a> is a stellar example. Thankfully, DuckDB provides a similar Swiss-Army-knife set of tools to deal with them, including specifying <code>INTERVAL</code>s (a special data type that represents a period of time independent of specific time values) to modify <code>TIMESTAMP</code> values using addition or subtraction.</p>
<h3 id="tell-me-the-query-please">Tell me the query, PLEASE!</h3>
<p>Okay - that was a lot of background. Let&rsquo;s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">DATEDIFF</span><span class="p">(</span><span class="s1">&#39;seconds&#39;</span><span class="p">,</span><span class="w"> </span><span class="n">arrival_time</span><span class="p">,</span><span class="w"> </span><span class="n">departure_time</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">duration</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">((</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">))</span><span class="w">
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>A small, succinct query such as this will need a bit of explanation to take it all in. Here&rsquo;s one below, reproducible in Python (make sure to install <code>duckdb</code> first!). Expand it to view.</p>
<details markdown="1"><summary>SQL with explanation.</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> ,A.window_open
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.window_close
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> -- LIST aggregates the values into a LIST column
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> -- and LIST_DISTINCT finds the unique values in it
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> -- finally, LIST_UNIQUE calculates the unique number of values in it
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2"> FROM (
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2"> ,arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2"> ,departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2"> FROM data -- remember we defined data as the Polars DataFrame with our truck station data
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2"> ) A
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2"> LEFT JOIN (
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2"> -- This is the time, in seconds between the arrival and departure of
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2"> -- each truck PER ROW in the original data-frame
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2"> ,DATEDIFF(&#39;seconds&#39;, arrival_time, departure_time) AS duration
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2"> FROM data -- this is where we perform a self-join
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2"> ) B
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2"> ON (
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2"> -- Case 2 in the diagram;
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2"> (B.arrival_time &lt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2"> -- Adding the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2"> -- is at least ENDING AFTER the start of the overlap window
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2"> (B.arrival_time + TO_SECONDS(B.duration)) &gt;= A.window_open) OR
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="s2"> -- Case 3 in the diagram - the simplest of all five cases
</span></span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="s2"> B.departure_time &lt;= A.window_close) OR
</span></span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="s2"> -- Case 4 in the diagram;
</span></span></span><span class="line"><span class="ln">43</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">44</span><span class="cl"><span class="s2"> -- Subtracting the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="s2"> -- STARTS BEFORE the end of the overlap window.
</span></span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="s2"> (B.departure_time - TO_SECONDS(B.duration)) &lt;= A.window_close)
</span></span></span><span class="line"><span class="ln">47</span><span class="cl"><span class="s2"> )
</span></span></span><span class="line"><span class="ln">48</span><span class="cl"><span class="s2"> GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">49</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div></details>
<p>The output of this query is:</p>
<pre tabindex="0"><code>&#34;&#34;&#34;
┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
│ arrival_time │ departure_time │ window_open │ … │ docked_trucks │ docked_truck_count │
│ timestamp │ timestamp │ timestamp │ │ varchar[] │ uint64 │
├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1] │ 1 │
│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1] │ 1 │
│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │ 4 │
│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3] │ 3 │
│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3] │ 3 │
├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
│ 9 rows 6 columns (5 shown) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
&#34;&#34;&#34;</code></pre><p>We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also find how DuckDB is able to seamlessly integrate with an existing Pandas or Polars pipeline with zero-conversion costs. In fact, we can convert this back to a Polars or Pandas dataframe by appending a method call after the closing bracket: <code>db.query(...).pl()</code> and <code>db.query(...).df()</code> respectively.</p>
<h2 id="can-we-make-the-sql-simpler">Can we make the SQL simpler?</h2>
<p>Now that we&rsquo;ve understood the logic that goes into the query, let&rsquo;s try to optimize the algorithm. We have the three conditions:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="c1">-- Case 2 in the diagram
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="w"></span><span class="c1">-- Case 3 in the diagram
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="w"></span><span class="c1">-- Case 4 in the diagram
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span></span></span></code></pre></div><p>What is common between these three conditions? It takes a while to see it; but it becomes clear that all these cases require the start of the overlap to be <em>before</em> the window ends, and the end of the overlap to be <em>after</em> the window starts. This can be simplified to just:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="w"></span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span></span></span></code></pre></div><p>making our query much simpler!</p>
<h3 id="simplified-sql-part-1">Simplified SQL: Part 1</h3>
<p>We&rsquo;ve removed the need for the <code>duration</code> calculation altogether now. Therefore, we can write:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="p">)</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Can we simplify this even further?</p>
<h3 id="simplification-part-2">Simplification: Part 2</h3>
<p>I think the SQL query in the above section is very easy to read already. However, it is a little clunky overall, and there is a way that we can leverage DuckDB&rsquo;s extensive optimizations to improve <strong>legibility</strong> by rewriting the query as a cross join:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">A</span><span class="p">,</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">WHERE</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"></span><span class="k">AND</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Why does this work? Before optimization on DuckDB, this is what the query plan looks like:</p>
<details markdown="1"><summary>DuckDB query plan before optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ FILTER │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ (arrival_time &lt;= │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│(departure_time + to_m... │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ (departure_time &gt;= │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│(arrival_time - to_min... │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">│ CROSS_PRODUCT ├──────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>After optimization, the <code>CROSS_PRODUCT</code> is <strong>automatically</strong> optimized to an <strong>interval join</strong>!</p>
<details markdown="1"><summary>DuckDB query plan after optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ COMPARISON_JOIN │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ INNER │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│ ((departure_time + &#39;00:01 │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ :00&#39;::INTERVAL) &gt;= ├──────────────┐
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ arrival_time) │ │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│((arrival_time - &#39;00:01:00&#39;│ │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ ::INTERVAL) &lt;= │ │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">│ departure_time) │ │
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>So in effect, we&rsquo;re actually exploiting a feature of DuckDB to allow us to write our queries in a suboptimal manner for greater readability, and allowing the optimizer to do a good chunk of our work for us. I wouldn&rsquo;t recommend using this generally, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.</p>
<h3 id="how-to-get-query-plans">How to get query plans?</h3>
<p>I&rsquo;m glad you asked. Here&rsquo;s the DuckDB <a href="https://duckdb.org/docs/guides/meta/explain.html">page explaining <code>EXPLAIN</code></a> (heh). Here&rsquo;s the code I used:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&#34;SET EXPLAIN_OUTPUT=&#39;all&#39;;&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="nb">print</span><span class="p">(</span><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">EXPLAIN
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">SELECT
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">FROM data A, data B
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">WHERE B.arrival_time &lt;= window_close
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">AND B.departure_time &gt;= window_open
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">pl</span><span class="p">()[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span></span></span></code></pre></div><h1 id="what-are-the-alternatives">What are the alternatives?</h1>
<h2 id="the-datatable-way">The <code>data.table</code> way</h2>
<p><a href="https://github.com/Rdatatable/data.table"><code>data.table</code></a> is a package that has historically been ahead of its time - in both speed and features that it has had. Development has taken a hit recently, but will likely <a href="https://github.com/Rdatatable/data.table/issues/5656">pick back up</a>. It&rsquo;s my favourite package on all fronts for data manipulation, but suffers simply from the lack of broader R support across the ML and DL space.</p>
<h3 id="the-foverlaps-function">The <code>foverlaps</code> function</h3>
<p>If this kind of overlapping join is common, shouldn&rsquo;t someone have developed a package for it? Turns out, <code>data.table</code> has, and with very specific constraints that make it the perfect solution to our problem (if you don&rsquo;t mind switching over to R, that is).</p>
<p>The <code>foverlaps</code> function has these requirements:</p>
<ol>
<li>The input <code>data.table</code> objects have to be keyed for automatic recognition of columns.</li>
<li>The default match type matches all three cases from the image above. Side note: it also has match types for <code>within</code> overlap, and for matching <code>start</code> and <code>end</code> windows.</li>
<li>The last two matching columns in the join condition in <code>by</code> must specify the <code>start</code> and <code>end</code> points of the overlapping window. This isn&rsquo;t a problem for us now, but does restrict future uses where we may want non-equi joins in other cases.</li>
</ol>
<h3 id="the-code-_si_-the-code">The code, <em>si</em>, the code!</h3>
<p>Without further ado:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">data.table</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">lubridate</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl">
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="n">arrival_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s">&#39;2023-01-01 06:23:47.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:26:42.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s">&#39;2023-01-01 06:30:20.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:32:06.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s">&#39;2023-01-01 06:33:09.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:34:08.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:40.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:37:43.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s">&#39;2023-01-01 06:39:48.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="n">departure_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s">&#39;2023-01-01 06:25:08.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:28:02.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s">&#39;2023-01-01 06:35:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:33:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:39:49.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s">&#39;2023-01-01 06:38:34.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:40:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s">&#39;2023-01-01 06:46:10.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="n">ID</span> <span class="o">=</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;B3&#39;</span><span class="p">,</span> <span class="s">&#39;C3&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl">
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">DT</span> <span class="o">=</span> <span class="nf">data.table</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">arrival_time</span> <span class="o">=</span> <span class="n">arrival_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">departure_time</span> <span class="o">=</span> <span class="n">departure_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="n">ID</span> <span class="o">=</span> <span class="n">ID</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl">
</span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="c1"># A copy(DT) creates a copy of a data.table that isn&#39;t linked</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="c1"># to the original one, so that changes in it don&#39;t reflect in</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="c1"># the original DT object.</span>
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># The `:=` allows assignment by reference (i.e. &#34;in place&#34;).</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="n">DT_with_windows</span> <span class="o">=</span> <span class="nf">copy</span><span class="p">(</span><span class="n">DT</span><span class="p">)</span><span class="n">[</span><span class="p">,</span> <span class="nf">`:=`</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">window_start</span> <span class="o">=</span> <span class="n">arrival_time</span> <span class="o">-</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">window_end</span> <span class="o">=</span> <span class="n">departure_time</span> <span class="o">+</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">))</span><span class="n">]</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl">
</span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="c1"># This step is necessary for the second table, but not the first, but we</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="c1"># key both data.tables to make the foverlaps code very succinct.</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;arrival_time&#34;</span><span class="p">,</span> <span class="s">&#34;departure_time&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT_with_windows</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;window_start&#34;</span><span class="p">,</span> <span class="s">&#34;window_end&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">37</span><span class="cl">
</span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="c1"># The foverlaps function returns a data.table, so we can simply apply</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="c1"># the usual data.table syntax on it!</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="c1"># Since we have the same name of some columns in both data.tables,</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="c1"># the latter table&#39;s columns are prefixed with &#34;i.&#34; to avoid conflicts.</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="nf">foverlaps</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="n">DT_with_windows</span><span class="p">)</span><span class="n">[</span>
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="p">,</span> <span class="n">.(docked_trucks</span> <span class="o">=</span> <span class="nf">list</span><span class="p">(</span><span class="nf">unique</span><span class="p">(</span><span class="n">i.ID</span><span class="p">)),</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl"> <span class="n">docked_truck_count</span> <span class="o">=</span> <span class="nf">uniqueN</span><span class="p">(</span><span class="n">i.ID</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">45</span><span class="cl"> <span class="p">,</span> <span class="n">.(arrival_time</span><span class="p">,</span> <span class="n">departure_time</span><span class="p">)</span><span class="n">]</span></span></span></code></pre></div><p>provides us the output:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"> <span class="n">arrival_time</span> <span class="n">departure_time</span> <span class="n">docked_trucks</span> <span class="n">docked_truck_count</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">list</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">int</span><span class="o">&gt;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="m">1</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">23</span><span class="o">:</span><span class="m">47</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">25</span><span class="o">:</span><span class="m">08</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="m">2</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">26</span><span class="o">:</span><span class="m">42</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">28</span><span class="o">:</span><span class="m">02</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="m">3</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">30</span><span class="o">:</span><span class="m">20</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">35</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="m">4</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">32</span><span class="o">:</span><span class="m">06</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">48</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="m">5</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">09</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="m">6</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">34</span><span class="o">:</span><span class="m">08</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">49</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="m">7</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">40</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">38</span><span class="o">:</span><span class="m">34</span> <span class="n">B3</span><span class="p">,</span><span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="m">8</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">37</span><span class="o">:</span><span class="m">43</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">40</span><span class="o">:</span><span class="m">48</span> <span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">3</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="m">9</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">48</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">46</span><span class="o">:</span><span class="m">10</span> <span class="n">C3</span><span class="p">,</span><span class="n">A5</span><span class="p">,</span><span class="n">A6</span> <span class="m">3</span></span></span></code></pre></div><h3 id="considerations-for-using-datatable">Considerations for using <code>data.table</code></h3>
<p>The package offers a wonderful, nearly one-stop solution that doesn&rsquo;t require you to write the logic out for the query or command yourself, but has a major problem for a lot of users - it requires you to switch your codebase to R, and a lot of your tasks may be on Python or in an SQL pipeline. So, what do you do?</p>
<p>Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you&rsquo;ll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.</p>
</content>
<p>
</p>
<p>
<a href='mailto:me@example.com?subject=Reply%20to%20"Overlap%20Joins"'>
Reply to this post by email ↪
</a>
</p>
</main>
<footer><small>
Avinash Mallya | Made with <a href="https://github.com/clente/hugo-bearcub">Bear Cub</a>
</small></footer>
</body>
</html>

106
public/posts/index.html Normal file
View File

@@ -0,0 +1,106 @@
<!DOCTYPE html>
<html lang="en-US">
<head><script src="/livereload.js?mindelay=10&amp;v=2&amp;port=1313&amp;path=livereload" data-no-instant defer></script>
<meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Posts | Avinash&#39;s Blog</title>
<meta name="title" content="Posts" />
<meta name="description" content="Bear Cub Demo" />
<meta name="author" content="" />
<meta name="keywords" content="" />
<meta property="og:url" content="http://localhost:1313/posts/">
<meta property="og:site_name" content="Avinash&#39;s Blog">
<meta property="og:title" content="Posts">
<meta property="og:description" content="Bear Cub Demo">
<meta property="og:locale" content="en_US">
<meta property="og:type" content="website">
<meta property="fb:admins" content="0000000000">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Posts">
<meta name="twitter:description" content="Bear Cub Demo">
<meta name="twitter:site" content="@example">
<meta itemprop="name" content="Posts">
<meta itemprop="description" content="Bear Cub Demo">
<meta itemprop="datePublished" content="2023-06-22T17:27:50-04:00">
<meta itemprop="dateModified" content="2023-06-22T17:27:50-04:00">
<meta name="referrer" content="no-referrer-when-downgrade" />
<link href="/herman.min.css" rel="stylesheet">
<link rel="alternate" type="application/rss+xml" href="http://localhost:1313/posts/index.xml" title="Avinash's Blog" />
</head>
<body>
<header><a class="skip-link" href="#main-content">Skip to main content</a>
<a href="/" class="title"><h1>Avinash&#39;s Blog</h1></a>
<nav>
<a href='http://localhost:1313/index.xml'>RSS</a>
</nav>
</header>
<main id="main-content">
<content>
<ul class="blog-posts">
<li>
<span>
<i>
<time datetime='2023-06-22' pubdate>
2023-06-22
</time>
</i>
</span>
<a href="/posts/001_overlap_joins/">Overlap Joins</a>
</li>
</ul>
<div>
</div>
</content>
</main>
<footer><small>
Avinash Mallya | Made with <a href="https://github.com/clente/hugo-bearcub">Bear Cub</a>
</small></footer>
</body>
</html>

463
public/posts/index.xml Normal file
View File

@@ -0,0 +1,463 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Posts on Avinash&#39;s Blog</title>
<link>http://localhost:1313/posts/</link>
<description>Recent content in Posts on Avinash&#39;s Blog</description>
<generator>Hugo -- gohugo.io</generator>
<language>en-US</language>
<managingEditor>me@example.com (John Doe)</managingEditor>
<webMaster>me@example.com (John Doe)</webMaster>
<copyright>Avinash Mallya</copyright>
<lastBuildDate>Thu, 22 Jun 2023 17:27:50 -0400</lastBuildDate>
<atom:link href="http://localhost:1313/posts/index.xml" rel="self" type="application/rss+xml" />
<item>
<title>Overlap Joins</title>
<link>http://localhost:1313/posts/001_overlap_joins/</link>
<pubDate>Thu, 22 Jun 2023 17:27:50 -0400</pubDate><author>me@example.com (John Doe)</author>
<guid>http://localhost:1313/posts/001_overlap_joins/</guid>
<description>&lt;h1 id=&#34;premise&#34;&gt;Premise&lt;/h1&gt;&#xA;&lt;p&gt;I stumbled upon an interesting &lt;a href=&#34;https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period&#34;&gt;Stack Overflow question&lt;/a&gt; that was linked &lt;a href=&#34;https://github.com/pola-rs/polars/issues/9467&#34;&gt;via an issue&lt;/a&gt; on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.&lt;/p&gt;&#xA;&lt;p&gt;I&amp;rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.&lt;/p&gt;&#xA;&lt;h1 id=&#34;problem-statement&#34;&gt;Problem Statement&lt;/h1&gt;&#xA;&lt;p&gt;Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&amp;rsquo;s ID.&lt;/p&gt;</description>
<content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>I stumbled upon an interesting <a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stack Overflow question</a> that was linked <a href="https://github.com/pola-rs/polars/issues/9467">via an issue</a> on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.</p>
<p>I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.</p>
<h1 id="problem-statement">Problem Statement</h1>
<p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span> <span class="c1"># if you don&#39;t have polars, run </span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="c1"># pip install &#39;polars[all]&#39;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">from_repr</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1 │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1 │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5 │
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3 │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3 │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6 │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5 │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6 │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div><p>We want to identify the number of trucks docked at any given time within a threshold of 1 minute <em>prior</em> to the arrival time of a truck, and 1 minute <em>after</em> the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.</p>
<h1 id="finding-a-solution-to-the-problem">Finding a solution to the problem</h1>
<h2 id="evaluate-for-a-specific-row">Evaluate for a specific row</h2>
<p>Before we find a general solution to this problem, let&rsquo;s consider a specific row to understand the problem better:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span></span></span></code></pre></div><p>For this row, we need to find the number of trucks that are there between <code>2023-01-01 06:31:06</code> (1 minute prior to the <code>arrival_time</code>) and <code>2023-01-01 06:34:48</code> (1 minute after the <code>departure_time</code>). Manually going through the original dataset, we see that <code>B3</code>, <code>C3</code>, <code>A6</code> and <code>A5</code> are the truck IDs that qualify - they are all at the station at some point between <code>2023-01-01 06:31:06</code> and <code>2023-01-01 06:34:48</code>.</p>
<h2 id="visually-deriving-an-algorithm">Visually deriving an algorithm</h2>
<p>There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap <em>window</em> relative to the arrival and departure times):</p>
<p><img src="./assets/001_overlap_joins/overlap_algorithm.png" alt="The five different ways a period can overlap."></p>
<p>Take some time to absorb these cases - it&rsquo;s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.</p>
<h2 id="writing-an-sql-query-based-on-the-algorithm">Writing an SQL query based on the algorithm</h2>
<p>In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It&rsquo;s often able to convey elegantly the logic that was used to execute the algorithm; and while it does come with excessive verbosity at times, it doesn&rsquo;t in this case.</p>
<p>Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).</p>
<h3 id="introducing-the-duckdb-package">Introducing the DuckDB package</h3>
<p>Once again, in theory, any SQL package or language can be used. Far too few however meet the ease-of-use that <a href="https://duckdb.org/">DuckDB</a> provides:</p>
<ol>
<li>no expensive set-up time (meaning no need for setting up databases, even temporary ones),</li>
<li>no dependencies (other than DuckDB itself, just <code>pip install duckdb</code>),</li>
<li>some very <a href="https://duckdb.org/2022/05/04/friendlier-sql.html">friendly SQL extensions</a>, and</li>
<li>ability to work directly on Polars and Pandas DataFrames without conversions</li>
</ol>
<p>all with <a href="https://duckdblabs.github.io/db-benchmark/">mind-blowing speed</a> that stands shoulder-to-shoulder with Polars. We&rsquo;ll also use a few advanced SQL concepts noted below.</p>
<h4 id="self-joins">Self-joins</h4>
<p>This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.</p>
<h4 id="a-bullet-train-recap-of-non-equi-joins">A bullet train recap of non-equi joins</h4>
<p>A key concept that we&rsquo;ll use is the idea of joining on a <em>range</em> of values rather than a specific value. That is, instead of the usual <code>LEFT JOIN ON A.column = B.column</code>, we can do <code>LEFT JOIN ON A.column &lt;= B.column</code> for one row in table <code>A</code> to match to multiple rows in <code>B</code>. DuckDB has a <a href="https://duckdb.org/2022/05/27/iejoin.html">blog post</a> that outlines this join in detail, including a fast implementation.</p>
<h4 id="the-concept-of-list-columns">The concept of <code>LIST</code> columns</h4>
<p>DuckDB has first class support for <code>LIST</code> columns - that is, each row in a <code>LIST</code> column can have a varying length (much like a Python <code>list</code>), but must have the exact same datatype (like R&rsquo;s <code>vector</code>). Using list columns allows us to eschew the use of an additional <code>GROUP BY</code> operation on top of a <code>WHERE</code> filter or <code>SELECT DISTINCT</code> operation, since we can directly perform those on the <code>LIST</code> column itself.</p>
<h4 id="date-algebra">Date algebra</h4>
<p>Dates can be rather difficult to handle well in most tools and languages, with several packages purpose built to make handling them easier - <a href="https://lubridate.tidyverse.org/">lubridate</a> from the <a href="https://www.tidyverse.org/">tidyverse</a> is a stellar example. Thankfully, DuckDB provides a similar swiss-knife set of tools to deal with them, including specifying <code>INTERVAL</code>s (a special data type that represents a period of time independent of specific time values) to modify <code>TIMESTAMP</code> values using addition or subtraction.</p>
<h3 id="tell-me-the-query-please">Tell me the query, PLEASE!</h3>
<p>Okay - that was a lot of background. Let&rsquo;s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">DATEDIFF</span><span class="p">(</span><span class="s1">&#39;seconds&#39;</span><span class="p">,</span><span class="w"> </span><span class="n">arrival_time</span><span class="p">,</span><span class="w"> </span><span class="n">departure_time</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">duration</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">((</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">))</span><span class="w">
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>A small, succinct query such as this will need a bit of explanation to take it all in. Here&rsquo;s one below, reproducible in Python (make sure to install <code>duckdb</code> first!). Expand it to view.</p>
<details markdown="1"><summary>SQL with explanation.</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> ,A.window_open
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.window_close
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> -- LIST aggregates the values into a LIST column
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> -- and LIST_DISTINCT finds the unique values in it
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> -- finally, LIST_UNIQUE calculates the unique number of values in it
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2"> FROM (
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2"> ,arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2"> ,departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2"> FROM data -- remember we defined data as the Polars DataFrame with our truck station data
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2"> ) A
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2"> LEFT JOIN (
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2"> -- This is the time, in seconds between the arrival and departure of
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2"> -- each truck PER ROW in the original data-frame
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2"> ,DATEDIFF(&#39;seconds&#39;, arrival_time, departure_time) AS duration
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2"> FROM data -- this is where we perform a self-join
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2"> ) B
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2"> ON (
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2"> -- Case 2 in the diagram;
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2"> (B.arrival_time &lt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2"> -- Adding the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2"> -- is at least ENDING AFTER the start of the overlap window
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2"> (B.arrival_time + TO_SECONDS(B.duration)) &gt;= A.window_open) OR
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="s2"> -- Case 3 in the diagram - the simplest of all five cases
</span></span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="s2"> B.departure_time &lt;= A.window_close) OR
</span></span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="s2"> -- Case 4 in the digram;
</span></span></span><span class="line"><span class="ln">43</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">44</span><span class="cl"><span class="s2"> -- Subtracting the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="s2"> -- STARTS BEFORE the end of the overlap window.
</span></span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="s2"> (B.departure_time - TO_SECONDS(B.duration)) &lt;= A.window_close)
</span></span></span><span class="line"><span class="ln">47</span><span class="cl"><span class="s2"> )
</span></span></span><span class="line"><span class="ln">48</span><span class="cl"><span class="s2"> GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">49</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div></details>
<p>The output of this query is:</p>
<pre tabindex="0"><code>&#34;&#34;&#34;
┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
│ arrival_time │ departure_time │ window_open │ … │ docked_trucks │ docked_truck_count │
│ timestamp │ timestamp │ timestamp │ │ varchar[] │ uint64 │
├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1] │ 1 │
│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1] │ 1 │
│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │ 4 │
│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3] │ 3 │
│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3] │ 3 │
├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
│ 9 rows 6 columns (5 shown) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
&#34;&#34;&#34;</code></pre><p>We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also see how DuckDB is able to seamlessly integrate with an existing Pandas or Polars pipeline with zero-conversion costs. In fact, we can convert the result back to a Polars or Pandas dataframe by appending <code>.pl()</code> or <code>.df()</code> after the closing bracket, i.e. <code>db.query(...).pl()</code> and <code>db.query(...).df()</code> respectively.</p>
<h2 id="can-we-make-the-sql-simpler">Can we make the SQL simpler?</h2>
<p>Now that we&rsquo;ve understood the logic that goes into the query, let&rsquo;s try to optimize the algorithm. We have the three conditions:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="c1">-- Case 2 in the diagram
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="w"></span><span class="c1">-- Case 3 in the diagram
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="w"></span><span class="c1">-- Case 4 in the diagram
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span></span></span></code></pre></div><p>What is common between these three conditions? It takes a while to see it; but it becomes clear that all these cases require the start of the overlap to be <em>before</em> the window ends, and the end of the overlap to be <em>after</em> the window starts. This can be simplified to just:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="w"></span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span></span></span></code></pre></div><p>making our query much simpler!</p>
<h3 id="simplified-sql-part-1">Simplified SQL: Part 1</h3>
<p>We&rsquo;ve removed the need for the <code>duration</code> calculation altogether now. Therefore, we can write:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="p">)</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Can we simplify this even further?</p>
<h3 id="simplification-part-2">Simplification: Part 2</h3>
<p>I think the SQL query in the above section is very easy to read already. However, it is a little clunky overall, and there is a way that we can leverage DuckDB&rsquo;s extensive optimizations to improve its <strong>legibility</strong> by rewriting the query as a cross join:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">A</span><span class="p">,</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">WHERE</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"></span><span class="k">AND</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Why does this work? Before optimization on DuckDB, this is what the query plan looks like:</p>
<details markdown="1"><summary>DuckDB query plan before optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ FILTER │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ (arrival_time &lt;= │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│(departure_time + to_m... │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ (departure_time &gt;= │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│(arrival_time - to_min... │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">│ CROSS_PRODUCT ├──────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>After optimization, the <code>CROSS_PRODUCT</code> is <strong>automatically</strong> optimized to an <strong>interval join</strong>!</p>
<details markdown="1"><summary>DuckDB query plan after optimization</summary>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ COMPARISON_JOIN │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ INNER │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│ ((departure_time + &#39;00:01 │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ :00&#39;::INTERVAL) &gt;= ├──────────────┐
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ arrival_time) │ │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│((arrival_time - &#39;00:01:00&#39;│ │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ ::INTERVAL) &lt;= │ │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">│ departure_time) │ │
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div></details>
<p>So in effect, we&rsquo;re actually exploiting a feature of DuckDB to allow us to write our queries in a suboptimal manner for greater readability, and allowing the optimizer to do a good chunk of our work for us. I wouldn&rsquo;t recommend using this generally, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.</p>
<h3 id="how-to-get-query-plans">How to get query plans?</h3>
<p>I&rsquo;m glad you asked. Here&rsquo;s the DuckDB <a href="https://duckdb.org/docs/guides/meta/explain.html">page explaining <code>EXPLAIN</code></a> (heh). Here&rsquo;s the code I used:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&#34;SET EXPLAIN_OUTPUT=&#39;all&#39;;&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="nb">print</span><span class="p">(</span><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">EXPLAIN
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">SELECT
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">FROM data A, data B
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">WHERE B.arrival_time &lt;= window_close
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">AND B.departure_time &gt;= window_open
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">pl</span><span class="p">()[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span></span></span></code></pre></div><h1 id="what-are-the-alternatives">What are the alternatives?</h1>
<h2 id="the-datatable-way">The <code>data.table</code> way</h2>
<p><a href="https://github.com/Rdatatable/data.table"><code>data.table</code></a> is a package that has historically been ahead of its time - in both speed and the features it offers. Development has taken a hit recently, but will likely <a href="https://github.com/Rdatatable/data.table/issues/5656">pick back up</a>. It&rsquo;s my favourite package on all fronts for data manipulation, but suffers simply from the lack of broader R support across the ML and DL space.</p>
<h3 id="the-foverlaps-function">The <code>foverlaps</code> function</h3>
<p>If this kind of overlapping join is common, shouldn&rsquo;t someone have developed a package for it? Turns out, <code>data.table</code> has, and with very specific constraints that make it the perfect solution to our problem (if you don&rsquo;t mind switching over to R, that is).</p>
<p>The <code>foverlaps</code> function has these requirements:</p>
<ol>
<li>The input <code>data.table</code> objects have to be keyed for automatic recognition of columns.</li>
<li>The default match type matches all three cases from the image above. Side note: it also supports matching on <code>within</code> overlaps, and on the <code>start</code> and <code>end</code> of windows.</li>
<li>The last two matching columns in the join condition in <code>by</code> must specify the <code>start</code> and <code>end</code> points of the overlapping window. This isn&rsquo;t a problem for us now, but does restrict future uses where we may want non-equi joins on other cases.</li>
</ol>
<h3 id="the-code-_si_-the-code">The code, <em>si</em>, the code!</h3>
<p>Without further ado:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">data.table</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">lubridate</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl">
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="n">arrival_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s">&#39;2023-01-01 06:23:47.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:26:42.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s">&#39;2023-01-01 06:30:20.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:32:06.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s">&#39;2023-01-01 06:33:09.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:34:08.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:40.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:37:43.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s">&#39;2023-01-01 06:39:48.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="n">departure_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s">&#39;2023-01-01 06:25:08.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:28:02.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s">&#39;2023-01-01 06:35:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:33:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:39:49.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s">&#39;2023-01-01 06:38:34.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:40:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s">&#39;2023-01-01 06:46:10.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="n">ID</span> <span class="o">=</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;B3&#39;</span><span class="p">,</span> <span class="s">&#39;C3&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl">
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">DT</span> <span class="o">=</span> <span class="nf">data.table</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">arrival_time</span> <span class="o">=</span> <span class="n">arrival_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">departure_time</span> <span class="o">=</span> <span class="n">departure_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="n">ID</span> <span class="o">=</span> <span class="n">ID</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl">
</span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="c1"># A copy(DT) creates a copy of a data.table that isn&#39;t linked</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="c1"># to the original one, so that changes in it don&#39;t reflect in</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="c1"># the original DT object.</span>
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># The `:=` allows assignment by reference (i.e. &#34;in place&#34;).</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="n">DT_with_windows</span> <span class="o">=</span> <span class="nf">copy</span><span class="p">(</span><span class="n">DT</span><span class="p">)</span><span class="n">[</span><span class="p">,</span> <span class="nf">`:=`</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">window_start</span> <span class="o">=</span> <span class="n">arrival_time</span> <span class="o">-</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">window_end</span> <span class="o">=</span> <span class="n">departure_time</span> <span class="o">+</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">))</span><span class="n">]</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl">
</span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="c1"># This step is necessary for the second table, but not the first, but we</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="c1"># key both data.tables to make the foverlaps code very succinct.</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;arrival_time&#34;</span><span class="p">,</span> <span class="s">&#34;departure_time&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT_with_windows</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;window_start&#34;</span><span class="p">,</span> <span class="s">&#34;window_end&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">37</span><span class="cl">
</span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="c1"># The foverlaps function returns a data.table, so we can simply apply</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="c1"># the usual data.table syntax on it!</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="c1"># Since we have the same name of some columns in both data.tables,</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="c1"># the latter table&#39;s columns are prefixed with &#34;i.&#34; to avoid conflicts.</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="nf">foverlaps</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="n">DT_with_windows</span><span class="p">)</span><span class="n">[</span>
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="p">,</span> <span class="n">.(docked_trucks</span> <span class="o">=</span> <span class="nf">list</span><span class="p">(</span><span class="nf">unique</span><span class="p">(</span><span class="n">i.ID</span><span class="p">)),</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl"> <span class="n">docked_truck_count</span> <span class="o">=</span> <span class="nf">uniqueN</span><span class="p">(</span><span class="n">i.ID</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">45</span><span class="cl"> <span class="p">,</span> <span class="n">.(arrival_time</span><span class="p">,</span> <span class="n">departure_time</span><span class="p">)</span><span class="n">]</span></span></span></code></pre></div><p>provides us the output:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"> <span class="n">arrival_time</span> <span class="n">departure_time</span> <span class="n">docked_trucks</span> <span class="n">docked_truck_count</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">list</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">int</span><span class="o">&gt;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="m">1</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">23</span><span class="o">:</span><span class="m">47</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">25</span><span class="o">:</span><span class="m">08</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="m">2</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">26</span><span class="o">:</span><span class="m">42</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">28</span><span class="o">:</span><span class="m">02</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="m">3</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">30</span><span class="o">:</span><span class="m">20</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">35</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="m">4</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">32</span><span class="o">:</span><span class="m">06</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">48</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="m">5</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">09</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="m">6</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">34</span><span class="o">:</span><span class="m">08</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">49</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="m">7</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">40</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">38</span><span class="o">:</span><span class="m">34</span> <span class="n">B3</span><span class="p">,</span><span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="m">8</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">37</span><span class="o">:</span><span class="m">43</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">40</span><span class="o">:</span><span class="m">48</span> <span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">3</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="m">9</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">48</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">46</span><span class="o">:</span><span class="m">10</span> <span class="n">C3</span><span class="p">,</span><span class="n">A5</span><span class="p">,</span><span class="n">A6</span> <span class="m">3</span></span></span></code></pre></div><h3 id="considerations-for-using-datatable">Considerations for using <code>data.table</code></h3>
<p>The package offers a wonderful, nearly one-stop solution that doesn&rsquo;t require you to write the logic out for the query or command yourself, but has a major problem for a lot of users - it requires you to switch your codebase to R, and a lot of your tasks may be on Python or in an SQL pipeline. So, what do you do?</p>
<p>Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you&rsquo;ll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.</p>
]]></content:encoded>
</item>
</channel>
</rss>

View File

@@ -0,0 +1,110 @@
<!DOCTYPE html>
<html lang="en-US">
<head><script src="/livereload.js?mindelay=10&amp;v=2&amp;port=1313&amp;path=livereload" data-no-instant defer></script>
<meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>My First Post | Avinash&#39;s Blog</title>
<meta name="title" content="My First Post" />
<meta name="description" content="Fucking hell." />
<meta name="author" content="" />
<meta name="keywords" content="" />
<meta property="og:url" content="http://localhost:1313/posts/my-first-post/">
<meta property="og:site_name" content="Avinash&#39;s Blog">
<meta property="og:title" content="My First Post">
<meta property="og:description" content="Fucking hell.">
<meta property="og:locale" content="en_US">
<meta property="og:type" content="article">
<meta property="article:section" content="posts">
<meta property="article:published_time" content="2025-09-13T17:55:17-04:00">
<meta property="article:modified_time" content="2025-09-13T17:55:17-04:00">
<meta property="fb:admins" content="0000000000">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="My First Post">
<meta name="twitter:description" content="Fucking hell.">
<meta name="twitter:site" content="@example">
<meta itemprop="name" content="My First Post">
<meta itemprop="description" content="Fucking hell.">
<meta itemprop="datePublished" content="2025-09-13T17:55:17-04:00">
<meta itemprop="dateModified" content="2025-09-13T17:55:17-04:00">
<meta itemprop="wordCount" content="2">
<meta name="referrer" content="no-referrer-when-downgrade" />
<link href="/herman.min.css" rel="stylesheet">
</head>
<body>
<header><a class="skip-link" href="#main-content">Skip to main content</a>
<a href="/" class="title"><h1>Avinash&#39;s Blog</h1></a>
<nav>
<a href='http://localhost:1313/index.xml'>RSS</a>
</nav>
</header>
<main id="main-content">
<h1>My First Post</h1>
<p class="byline">
<time datetime='2025-09-13' pubdate>
2025-09-13
</time>
</p>
<content>
<p>Fucking hell.</p>
</content>
<p>
</p>
<p>
<a href='mailto:me@example.com?subject=Reply%20to%20"My%20First%20Post"'>
Reply to this post by email ↪
</a>
</p>
</main>
<footer><small>
Avinash Mallya | Made with <a href="https://github.com/clente/hugo-bearcub">Bear Cub</a>
</small></footer>
</body>
</html>

View File

@@ -1,9 +1,8 @@
---
title: Fast overlap joins in SQL, Python and R
permalink: /docked_trucks_in_interval
author: Avinash Mallya
tags: [python, polars, duckdb, R, data.table, foverlaps, overlap, join]
---
+++
date = '2023-06-22T17:27:50-04:00'
draft = false
title = 'Overlap Joins'
+++
# Premise

View File

@@ -0,0 +1,562 @@
<!DOCTYPE html>
<html lang="en-US">
<head><script src="/livereload.js?mindelay=10&amp;v=2&amp;port=1313&amp;path=livereload" data-no-instant defer></script>
<meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Overlap Joins | Avinash&#39;s Blog</title>
<meta name="title" content="Overlap Joins" />
<meta name="description" content="Premise
I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement
Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID." />
<meta name="author" content="" />
<meta name="keywords" content="" />
<meta property="og:url" content="http://localhost:1313/posts/post/">
<meta property="og:site_name" content="Avinash&#39;s Blog">
<meta property="og:title" content="Overlap Joins">
<meta property="og:description" content="Premise I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.">
<meta property="og:locale" content="en_US">
<meta property="og:type" content="article">
<meta property="article:section" content="posts">
<meta property="article:published_time" content="2023-06-22T17:27:50-04:00">
<meta property="article:modified_time" content="2023-06-22T17:27:50-04:00">
<meta property="fb:admins" content="0000000000">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Overlap Joins">
<meta name="twitter:description" content="Premise I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.">
<meta name="twitter:site" content="@example">
<meta itemprop="name" content="Overlap Joins">
<meta itemprop="description" content="Premise I stumbled upon an interesting Stackoverflow question that was linked via an issue on Polars github repo. The OP asked for a pure Polars solution. At the time of answering the question Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.
I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.
Problem Statement Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.">
<meta itemprop="datePublished" content="2023-06-22T17:27:50-04:00">
<meta itemprop="dateModified" content="2023-06-22T17:27:50-04:00">
<meta itemprop="wordCount" content="3065">
<meta name="referrer" content="no-referrer-when-downgrade" />
<link href="/herman.min.css" rel="stylesheet">
<link href="/syntax.min.css" rel="stylesheet">
</head>
<body>
<header><a class="skip-link" href="#main-content">Skip to main content</a>
<a href="/" class="title"><h1>Avinash&#39;s Blog</h1></a>
<nav>
<a href='http://localhost:1313/index.xml'>RSS</a>
</nav>
</header>
<main id="main-content">
<h1>Overlap Joins</h1>
<p class="byline">
<time datetime='2023-06-22' pubdate>
2023-06-22
</time>
</p>
<content>
<h1 id="premise">Premise</h1>
<p>I stumbled upon an interesting <a href="https://stackoverflow.com/questions/76488314/polars-count-unique-values-over-a-time-period">Stack Overflow question</a> that was linked <a href="https://github.com/pola-rs/polars/issues/9467">via an issue</a> on the Polars GitHub repo. The OP asked for a pure Polars solution. At the time of answering the question, Polars did not have support for non-equi joins, and any solution using it would be pretty cumbersome.</p>
<p>I&rsquo;m more of a right-tool-for-the-job person, so I tried to find a better solution.</p>
<h1 id="problem-statement">Problem Statement</h1>
<p>Suppose we have a dataset that captures the arrival and departure times of trucks at a station, along with the truck&rsquo;s ID.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span> <span class="c1"># if you don&#39;t have polars, run </span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="c1"># pip install &#39;polars[all]&#39;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">from_repr</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ 2023-01-01 06:23:47 ┆ 2023-01-01 06:25:08 ┆ A1 │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ 2023-01-01 06:26:42 ┆ 2023-01-01 06:28:02 ┆ A1 │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">│ 2023-01-01 06:30:20 ┆ 2023-01-01 06:35:01 ┆ A5 │
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ 2023-01-01 06:33:09 ┆ 2023-01-01 06:36:01 ┆ B3 │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ 2023-01-01 06:34:08 ┆ 2023-01-01 06:39:49 ┆ C3 │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ 2023-01-01 06:36:40 ┆ 2023-01-01 06:38:34 ┆ A6 │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ 2023-01-01 06:37:43 ┆ 2023-01-01 06:40:48 ┆ A5 │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ 2023-01-01 06:39:48 ┆ 2023-01-01 06:46:10 ┆ A6 │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div><p>We want to identify the number of trucks docked at any given time within a threshold of 1 minute <em>prior</em> to the arrival time of a truck, and 1 minute <em>after</em> the departure of a truck. Equivalently, this means that we need to calculate the number of trucks within a specific window for each row of the data.</p>
<h1 id="finding-a-solution-to-the-problem">Finding a solution to the problem</h1>
<h2 id="evaluate-for-a-specific-row">Evaluate for a specific row</h2>
<p>Before we find a general solution to this problem, let&rsquo;s consider a specific row to understand the problem better:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="s2">┌─────────────────────┬─────────────────────┬─────┐
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="s2">│ arrival_time ┆ departure_time ┆ ID │
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="s2">│ --- ┆ --- ┆ --- │
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="s2">│ datetime[μs] ┆ datetime[μs] ┆ str │
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="s2">╞═════════════════════╪═════════════════════╪═════╡
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="s2">│ 2023-01-01 06:32:06 ┆ 2023-01-01 06:33:48 ┆ A6 │
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="s2">└─────────────────────┴─────────────────────┴─────┘
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span></span></span></code></pre></div><p>For this row, we need to find the number of trucks that are there between <code>2023-01-01 06:31:06</code> (1 minute prior to the <code>arrival_time</code>) and <code>2023-01-01 06:34:48</code> (1 minute post the <code>departure_time</code>). Manually going through the original dataset, we see that <code>B3</code>, <code>C3</code>, <code>A6</code> and <code>A5</code> are the truck IDs that qualify - they all are at the station in a duration that is between <code>2023-01-01 06:31:06</code> and <code>2023-01-01 06:34:48</code>.</p>
<h2 id="visually-deriving-an-algorithm">Visually deriving an algorithm</h2>
<p>There are many cases that will qualify a truck to be present in the overlap window defined by a particular row. Specifically for the example above, we have (this visualization is generalizable, because for each row we can calculate without much difficulty the overlap <em>window</em> relative to the arrival and departure times):</p>
<p><img src="./assets/001_overlap_joins/overlap_algorithm.png" alt="The five different ways a period can overlap."></p>
<p>Take some time to absorb these cases - it&rsquo;s important for the part where we write the code for the solution. Note that we need to actually tell our algorithm to filter only for Cases 2, 3 and 4, since Cases 1 and 5 will not satisfy our requirements.</p>
<h2 id="writing-an-sql-query-based-on-the-algorithm">Writing an SQL query based on the algorithm</h2>
<p>In theory, we can use any language that has the capability to define rules that meet our algorithmic requirements outlined in the above section to find the solution. Why choose SQL? It&rsquo;s often able to convey elegantly the logic that was used to execute the algorithm; and while it can be excessively verbose at times, it isn&rsquo;t in this case.</p>
<p>Note here that we run SQL in Python with almost no setup or boilerplate code - so this is a Python based solution as well (although not quite Pythonic!).</p>
<h3 id="introducing-the-duckdb-package">Introducing the DuckDB package</h3>
<p>Once again, in theory, any SQL package or language can be used. Far too few, however, match the ease of use that <a href="https://duckdb.org/">DuckDB</a> provides:</p>
<ol>
<li>no expensive set-up time (meaning no need for setting up databases, even temporary ones),</li>
<li>no dependencies (other than DuckDB itself, just <code>pip install duckdb</code>),</li>
<li>some very <a href="https://duckdb.org/2022/05/04/friendlier-sql.html">friendly SQL extensions</a>, and</li>
<li>ability to work directly on Polars and Pandas DataFrames without conversions</li>
</ol>
<p>all with <a href="https://duckdblabs.github.io/db-benchmark/">mind-blowing speed</a> that stands shoulder-to-shoulder with Polars. We&rsquo;ll also use a few advanced SQL concepts noted below.</p>
<h4 id="self-joins">Self-joins</h4>
<p>This should be a familiar, albeit not often used concept - a join of a table with itself is a self join. There are few cases where such an operation would make sense, and this happens to be one of them.</p>
<h4 id="a-bullet-train-recap-of-non-equi-joins">A bullet train recap of non-equi joins</h4>
<p>A key concept that we&rsquo;ll use is the idea of joining on a <em>range</em> of values rather than a specific value. That is, instead of the usual <code>LEFT JOIN ON A.column = B.column</code>, we can do <code>LEFT JOIN ON A.column &lt;= B.column</code> for one row in table <code>A</code> to match to multiple rows in <code>B</code>. DuckDB has a <a href="https://duckdb.org/2022/05/27/iejoin.html">blog post</a> that outlines this join in detail, including a fast implementation.</p>
<h4 id="the-concept-of-list-columns">The concept of <code>LIST</code> columns</h4>
<p>DuckDB has first class support for <code>LIST</code> columns - that is, each row in a <code>LIST</code> column can have a varying length (much like a Python <code>list</code>), but must have the exact same datatype (like R&rsquo;s <code>vector</code>). Using list columns allows us to eschew the use of an additional <code>GROUP BY</code> operation on top of a <code>WHERE</code> filter or <code>SELECT DISTINCT</code> operation, since we can directly perform those on the <code>LIST</code> column itself.</p>
<h4 id="date-algebra">Date algebra</h4>
<p>Dates can be rather difficult to handle well in most tools and languages, with several packages purpose built to make handling them easier - <a href="https://lubridate.tidyverse.org/">lubridate</a> from the <a href="https://www.tidyverse.org/">tidyverse</a> is a stellar example. Thankfully, DuckDB provides a similar Swiss-army-knife set of tools to deal with it, including specifying <code>INTERVAL</code>s (a special data type that represents a period of time independent of specific time values) to modify <code>TIMESTAMP</code> values using addition or subtraction.</p>
<h3 id="tell-me-the-query-please">Tell me the query, PLEASE!</h3>
<p>Okay - that was a lot of background. Let&rsquo;s have at it! The query by itself in SQL is (see immediately below for runnable code in Python):</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">DATEDIFF</span><span class="p">(</span><span class="s1">&#39;seconds&#39;</span><span class="p">,</span><span class="w"> </span><span class="n">arrival_time</span><span class="p">,</span><span class="w"> </span><span class="n">departure_time</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">duration</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">((</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">))</span><span class="w">
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>A small, succinct query such as this will need a bit of explanation to take it all in. Here&rsquo;s one below, reproducible in Python (make sure to install <code>duckdb</code> first!). Expand it to view.</p>
<!-- raw HTML omitted -->
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> ,A.window_open
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.window_close
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> -- LIST aggregates the values into a LIST column
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> -- and LIST_DISTINCT finds the unique values in it
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> -- finally, LIST_UNIQUE calculates the number of unique values in it
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2"> FROM (
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2"> ,arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2"> ,departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2"> FROM data -- remember we defined data as the Polars DataFrame with our truck station data
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2"> ) A
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2"> LEFT JOIN (
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2"> SELECT
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2"> *
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2"> -- This is the time, in seconds between the arrival and departure of
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2"> -- each truck PER ROW in the original data-frame
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2"> ,DATEDIFF(&#39;seconds&#39;, arrival_time, departure_time) AS duration
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2"> FROM data -- this is where we perform a self-join
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2"> ) B
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2"> ON (
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2"> -- Case 2 in the diagram;
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2"> (B.arrival_time &lt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2"> -- Adding the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2"> -- is at least ENDING AFTER the start of the overlap window
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2"> (B.arrival_time + TO_SECONDS(B.duration)) &gt;= A.window_open) OR
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="s2"> -- Case 3 in the diagram - the simplest of all five cases
</span></span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="s2"> B.departure_time &lt;= A.window_close) OR
</span></span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="s2">
</span></span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="s2"> -- Case 4 in the diagram;
</span></span></span><span class="line"><span class="ln">43</span><span class="cl"><span class="s2"> (B.arrival_time &gt;= A.window_open AND
</span></span></span><span class="line"><span class="ln">44</span><span class="cl"><span class="s2"> -- Subtracting the duration here makes sure that the second interval
</span></span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="s2"> -- STARTS BEFORE the end of the overlap window.
</span></span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="s2"> (B.departure_time - TO_SECONDS(B.duration)) &lt;= A.window_close)
</span></span></span><span class="line"><span class="ln">47</span><span class="cl"><span class="s2"> )
</span></span></span><span class="line"><span class="ln">48</span><span class="cl"><span class="s2"> GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">49</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span></span></span></code></pre></div><!-- raw HTML omitted -->
<p>The output of this query is:</p>
<pre tabindex="0"><code>&#34;&#34;&#34;
┌─────────────────────┬─────────────────────┬─────────────────────┬───┬──────────────────┬────────────────────┐
│ arrival_time │ departure_time │ window_open │ … │ docked_trucks │ docked_truck_count │
│ timestamp │ timestamp │ timestamp │ │ varchar[] │ uint64 │
├─────────────────────┼─────────────────────┼─────────────────────┼───┼──────────────────┼────────────────────┤
│ 2023-01-01 06:23:47 │ 2023-01-01 06:25:08 │ 2023-01-01 06:22:47 │ … │ [A1] │ 1 │
│ 2023-01-01 06:26:42 │ 2023-01-01 06:28:02 │ 2023-01-01 06:25:42 │ … │ [A1] │ 1 │
│ 2023-01-01 06:30:20 │ 2023-01-01 06:35:01 │ 2023-01-01 06:29:20 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:32:06 │ 2023-01-01 06:33:48 │ 2023-01-01 06:31:06 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:33:09 │ 2023-01-01 06:36:01 │ 2023-01-01 06:32:09 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:34:08 │ 2023-01-01 06:39:49 │ 2023-01-01 06:33:08 │ … │ [B3, C3, A6, A5] │ 4 │
│ 2023-01-01 06:36:40 │ 2023-01-01 06:38:34 │ 2023-01-01 06:35:40 │ … │ [A5, A6, C3, B3] │ 4 │
│ 2023-01-01 06:37:43 │ 2023-01-01 06:40:48 │ 2023-01-01 06:36:43 │ … │ [A5, A6, C3] │ 3 │
│ 2023-01-01 06:39:48 │ 2023-01-01 06:46:10 │ 2023-01-01 06:38:48 │ … │ [A6, A5, C3] │ 3 │
├─────────────────────┴─────────────────────┴─────────────────────┴───┴──────────────────┴────────────────────┤
│ 9 rows 6 columns (5 shown) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
&#34;&#34;&#34;</code></pre><p>We clearly see the strengths of DuckDB in how succinctly we were able to express this operation. We also find how DuckDB is able to seamlessly integrate with an existing Pandas or Polars pipeline with zero-conversion costs. In fact, we can convert this back to a Polars or Pandas dataframe by appending a conversion call after the closing bracket: <code>db.query(...).pl()</code> and <code>db.query(...).df()</code> respectively.</p>
<h2 id="can-we-make-the-sql-simpler">Can we make the SQL simpler?</h2>
<p>Now that we&rsquo;ve understood the logic that goes into the query, let&rsquo;s try to optimize the algorithm. We have the three conditions:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="c1">-- Case 2 in the diagram
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="w"></span><span class="c1">-- Case 3 in the diagram
</span></span></span><span class="line"><span class="ln">5</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">6</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span><span class="w"> </span><span class="k">OR</span><span class="w">
</span></span></span><span class="line"><span class="ln">7</span><span class="cl"><span class="w"></span><span class="c1">-- Case 4 in the diagram
</span></span></span><span class="line"><span class="ln">8</span><span class="cl"><span class="c1"></span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">9</span><span class="cl"><span class="w"> </span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">TO_SECONDS</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">duration</span><span class="p">))</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="p">)</span></span></span></code></pre></div><p>What is common between these three conditions? It takes a while to see it; but it becomes clear that all these cases require the start of the overlap to be <em>before</em> the window ends, and the end of the overlap to be <em>after</em> the window starts. This can be simplified to just:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="w"></span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span></span></span></code></pre></div><p>making our query much simpler!</p>
<h3 id="simplified-sql-part-1">Simplified SQL: Part 1</h3>
<p>We&rsquo;ve removed the need for the <code>duration</code> calculation altogether now. Therefore, we can write:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"> </span><span class="k">SELECT</span><span class="w"> </span><span class="o">*</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="w"> </span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="p">)</span><span class="w"> </span><span class="n">A</span><span class="w">
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="w"></span><span class="k">LEFT</span><span class="w"> </span><span class="k">JOIN</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="w">
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="w"></span><span class="k">ON</span><span class="w"> </span><span class="p">(</span><span class="w">
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_close</span><span class="w"> </span><span class="k">AND</span><span class="w">
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="w"></span><span class="p">)</span><span class="w">
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Can we simplify this even further?</p>
<h3 id="simplification-part-2">Simplification: Part 2</h3>
<p>I think the SQL query in the above section is very easy to read already. However, it is a little clunky overall, and there is a way that we can leverage DuckDB&rsquo;s extensive optimizations to improve <strong>legibility</strong> by rewriting the query as a cross join:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">SELECT</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="w"> </span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">A</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="nb">INTERVAL</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="k">MINUTE</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_DISTINCT</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_trucks</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="w"> </span><span class="p">,</span><span class="n">LIST_UNIQUE</span><span class="p">(</span><span class="n">LIST</span><span class="p">(</span><span class="n">B</span><span class="p">.</span><span class="n">ID</span><span class="p">))</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">docked_truck_count</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">A</span><span class="p">,</span><span class="w"> </span><span class="k">data</span><span class="w"> </span><span class="n">B</span><span class="w">
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="w"></span><span class="k">WHERE</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">arrival_time</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">window_close</span><span class="w">
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="w"></span><span class="k">AND</span><span class="w"> </span><span class="n">B</span><span class="p">.</span><span class="n">departure_time</span><span class="w"> </span><span class="o">&gt;=</span><span class="w"> </span><span class="n">window_open</span><span class="w">
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span></span></span></code></pre></div><p>Why does this work? Before optimization on DuckDB, this is what the query plan looks like:</p>
<!-- raw HTML omitted -->
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ FILTER │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ (arrival_time &lt;= │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│(departure_time + to_m... │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ (departure_time &gt;= │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│(arrival_time - to_min... │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ AS BIGINT)))) │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">│ CROSS_PRODUCT ├──────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">37</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div><!-- raw HTML omitted -->
<p>After optimization, the <code>CROSS_PRODUCT</code> is <strong>automatically</strong> optimized to an <strong>interval join</strong>!</p>
<!-- raw HTML omitted -->
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="s2">┌───────────────────────────┐
</span></span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="s2">│ PROJECTION │
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">│ 0 │
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2">│ 1 │
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2">│ 2 │
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2">│ 3 │
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2">│ docked_trucks │
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2">│ docked_truck_count │
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">│ AGGREGATE │
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">│ arrival_time │
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">│ departure_time │
</span></span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="s2">│ window_open │
</span></span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="s2">│ window_close │
</span></span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="s2">│ list(ID) │
</span></span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="s2">└─────────────┬─────────────┘
</span></span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="s2">│ COMPARISON_JOIN │
</span></span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="s2">│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │
</span></span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="s2">│ INNER │
</span></span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="s2">│ ((departure_time + &#39;00:01 │
</span></span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="s2">│ :00&#39;::INTERVAL) &gt;= ├──────────────┐
</span></span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="s2">│ arrival_time) │ │
</span></span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="s2">│((arrival_time - &#39;00:01:00&#39;│ │
</span></span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="s2">│ ::INTERVAL) &lt;= │ │
</span></span></span><span class="line"><span class="ln">30</span><span class="cl"><span class="s2">│ departure_time) │ │
</span></span></span><span class="line"><span class="ln">31</span><span class="cl"><span class="s2">└─────────────┬─────────────┘ │
</span></span></span><span class="line"><span class="ln">32</span><span class="cl"><span class="s2">┌─────────────┴─────────────┐┌─────────────┴─────────────┐
</span></span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="s2">│ ARROW_SCAN ││ ARROW_SCAN │
</span></span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="s2">└───────────────────────────┘└───────────────────────────┘
</span></span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span> </span></span></code></pre></div><!-- raw HTML omitted -->
<p>So in effect, we&rsquo;re actually exploiting a feature of DuckDB to allow us to write our queries in a suboptimal manner for greater readability, and allowing the optimizer to do a good chunk of our work for us. I wouldn&rsquo;t recommend using this generally, because not all SQL engine optimizers will be able to find an efficient route to these calculations for large datasets.</p>
<h3 id="how-to-get-query-plans">How to get query plans?</h3>
<p>I&rsquo;m glad you asked. Here&rsquo;s the DuckDB <a href="https://duckdb.org/docs/guides/meta/explain.html">page explaining <code>EXPLAIN</code></a> (heh). Here&rsquo;s the code I used:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">duckdb</span> <span class="k">as</span> <span class="nn">db</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">db</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&#34;SET EXPLAIN_OUTPUT=&#39;all&#39;;&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="nb">print</span><span class="p">(</span><span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="s2">&#34;&#34;&#34;
</span></span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="s2">EXPLAIN
</span></span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="s2">SELECT
</span></span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="s2"> A.arrival_time
</span></span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="s2"> ,A.departure_time
</span></span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="s2"> ,A.arrival_time - (INTERVAL 1 MINUTE) AS window_open
</span></span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="s2"> ,A.departure_time + (INTERVAL 1 MINUTE) AS window_close
</span></span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="s2"> ,LIST_DISTINCT(LIST(B.ID)) AS docked_trucks
</span></span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="s2"> ,LIST_UNIQUE(LIST(B.ID)) AS docked_truck_count
</span></span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="s2">FROM data A, data B
</span></span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="s2">WHERE B.arrival_time &lt;= window_close
</span></span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="s2">AND B.departure_time &gt;= window_open
</span></span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="s2">GROUP BY 1, 2, 3, 4
</span></span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="s2">&#34;&#34;&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">pl</span><span class="p">()[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span></span></span></code></pre></div><h1 id="what-are-the-alternatives">What are the alternatives?</h1>
<h2 id="the-datatable-way">The <code>data.table</code> way</h2>
<p><a href="https://github.com/Rdatatable/data.table"><code>data.table</code></a> is a package that has historically been ahead of its time - in both speed and features that it has had. Development has taken a hit recently, but will likely <a href="https://github.com/Rdatatable/data.table/issues/5656">pick back up</a>. It&rsquo;s my favourite package on all fronts for data manipulation, but suffers simply from the lack of broader R support across the ML and DL space.</p>
<h3 id="the-foverlaps-function">The <code>foverlaps</code> function</h3>
<p>If this kind of overlapping join is common, shouldn&rsquo;t someone have developed a package for it? Turns out, <code>data.table</code> has, and with very specific constraints that make it the perfect solution to our problem (if you don&rsquo;t mind switching over to R, that is).</p>
<p>The <code>foverlaps</code> function has these requirements:</p>
<ol>
<li>The input <code>data.table</code> objects have to be keyed for automatic recognition of columns.</li>
<li>The default match type is that it matches all three cases from the image above. Side note: it also supports matching <code>within</code> overlaps, as well as matching <code>start</code> and <code>end</code> windows.</li>
<li>The last two matching columns in the join condition in <code>by</code> must specify the <code>start</code> and <code>end</code> points of the overlapping window. This isn&rsquo;t a problem for us now, but does restrict for future uses where we may want non-equi joins on other cases.</li>
</ol>
<h3 id="the-code-_si_-the-code">The code, <em>si</em>, the code!</h3>
<p>Without further ado:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">data.table</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="nf">library</span><span class="p">(</span><span class="n">lubridate</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl">
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="n">arrival_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s">&#39;2023-01-01 06:23:47.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:26:42.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s">&#39;2023-01-01 06:30:20.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:32:06.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s">&#39;2023-01-01 06:33:09.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:34:08.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:40.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:37:43.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s">&#39;2023-01-01 06:39:48.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="n">departure_time</span> <span class="o">=</span> <span class="nf">as_datetime</span><span class="p">(</span><span class="nf">c</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s">&#39;2023-01-01 06:25:08.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:28:02.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s">&#39;2023-01-01 06:35:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:33:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s">&#39;2023-01-01 06:36:01.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:39:49.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s">&#39;2023-01-01 06:38:34.000000&#39;</span><span class="p">,</span> <span class="s">&#39;2023-01-01 06:40:48.000000&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s">&#39;2023-01-01 06:46:10.000000&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="n">ID</span> <span class="o">=</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A1&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;B3&#39;</span><span class="p">,</span> <span class="s">&#39;C3&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">,</span> <span class="s">&#39;A5&#39;</span><span class="p">,</span> <span class="s">&#39;A6&#39;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl">
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">DT</span> <span class="o">=</span> <span class="nf">data.table</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">arrival_time</span> <span class="o">=</span> <span class="n">arrival_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">departure_time</span> <span class="o">=</span> <span class="n">departure_time</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="n">ID</span> <span class="o">=</span> <span class="n">ID</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="c1">######### BOILERPLATE CODE, NO LOGIC HERE ####################</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl">
</span></span><span class="line"><span class="ln">25</span><span class="cl"><span class="c1"># A copy(DT) creates a copy of a data.table that isn&#39;t linked</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"><span class="c1"># to the original one, so that changes in it don&#39;t reflect in</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl"><span class="c1"># the original DT object.</span>
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># The `:=` allows assignment by reference (i.e. &#34;in place&#34;).</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="n">DT_with_windows</span> <span class="o">=</span> <span class="nf">copy</span><span class="p">(</span><span class="n">DT</span><span class="p">)</span><span class="n">[</span><span class="p">,</span> <span class="nf">`:=`</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">window_start</span> <span class="o">=</span> <span class="n">arrival_time</span> <span class="o">-</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">window_end</span> <span class="o">=</span> <span class="n">departure_time</span> <span class="o">+</span> <span class="nf">minutes</span><span class="p">(</span><span class="m">1</span><span class="p">))</span><span class="n">]</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl">
</span></span><span class="line"><span class="ln">33</span><span class="cl"><span class="c1"># This step is necessary for the second table, but not the first, but we</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"><span class="c1"># key both data.tables to make the foverlaps code very succinct.</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;arrival_time&#34;</span><span class="p">,</span> <span class="s">&#34;departure_time&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl"><span class="nf">setkeyv</span><span class="p">(</span><span class="n">DT_with_windows</span><span class="p">,</span> <span class="nf">c</span><span class="p">(</span><span class="s">&#34;window_start&#34;</span><span class="p">,</span> <span class="s">&#34;window_end&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">37</span><span class="cl">
</span></span><span class="line"><span class="ln">38</span><span class="cl"><span class="c1"># The foverlaps function returns a data.table, so we can simply apply</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"><span class="c1"># the usual data.table syntax on it!</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"><span class="c1"># Since we have the same name of some columns in both data.tables,</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"><span class="c1"># the latter table&#39;s columns are prefixed with &#34;i.&#34; to avoid conflicts.</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl"><span class="nf">foverlaps</span><span class="p">(</span><span class="n">DT</span><span class="p">,</span> <span class="n">DT_with_windows</span><span class="p">)</span><span class="n">[</span>
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="p">,</span> <span class="n">.(docked_trucks</span> <span class="o">=</span> <span class="nf">list</span><span class="p">(</span><span class="nf">unique</span><span class="p">(</span><span class="n">i.ID</span><span class="p">)),</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl"> <span class="n">docked_truck_count</span> <span class="o">=</span> <span class="nf">uniqueN</span><span class="p">(</span><span class="n">i.ID</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">45</span><span class="cl"> <span class="p">,</span> <span class="n">.(arrival_time</span><span class="p">,</span> <span class="n">departure_time</span><span class="p">)</span><span class="n">]</span></span></span></code></pre></div><p>provides us the output:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-r" data-lang="r"><span class="line"><span class="ln"> 1</span><span class="cl"> <span class="n">arrival_time</span> <span class="n">departure_time</span> <span class="n">docked_trucks</span> <span class="n">docked_truck_count</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">POSc</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">list</span><span class="o">&gt;</span> <span class="o">&lt;</span><span class="n">int</span><span class="o">&gt;</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="m">1</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">23</span><span class="o">:</span><span class="m">47</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">25</span><span class="o">:</span><span class="m">08</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="m">2</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">26</span><span class="o">:</span><span class="m">42</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">28</span><span class="o">:</span><span class="m">02</span> <span class="n">A1</span> <span class="m">1</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="m">3</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">30</span><span class="o">:</span><span class="m">20</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">35</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="m">4</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">32</span><span class="o">:</span><span class="m">06</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">48</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="m">5</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">33</span><span class="o">:</span><span class="m">09</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">01</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="m">6</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">34</span><span class="o">:</span><span class="m">08</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">49</span> <span class="n">A5</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">B3</span><span class="p">,</span><span class="n">C3</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="m">7</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">36</span><span class="o">:</span><span class="m">40</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">38</span><span class="o">:</span><span class="m">34</span> <span class="n">B3</span><span class="p">,</span><span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">4</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="m">8</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">37</span><span class="o">:</span><span class="m">43</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">40</span><span class="o">:</span><span class="m">48</span> <span class="n">C3</span><span class="p">,</span><span class="n">A6</span><span class="p">,</span><span class="n">A5</span> <span class="m">3</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="m">9</span><span class="o">:</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">39</span><span class="o">:</span><span class="m">48</span> <span class="m">2023-01-01</span> <span class="m">06</span><span class="o">:</span><span class="m">46</span><span class="o">:</span><span class="m">10</span> <span class="n">C3</span><span class="p">,</span><span class="n">A5</span><span class="p">,</span><span class="n">A6</span> <span class="m">3</span></span></span></code></pre></div><h3 id="considerations-for-using-datatable">Considerations for using <code>data.table</code></h3>
<p>The package offers a wonderful, nearly one-stop solution that doesn&rsquo;t require you to write the logic out for the query or command yourself, but has a major problem for a lot of users - it requires you to switch your codebase to R, and a lot of your tasks may be on Python or in an SQL pipeline. So, what do you do?</p>
<p>Consider the effort in maintaining an additional dependency for your analytics pipeline (i.e. R), and the effort that you&rsquo;ll need to invest to run R from Python, or run an R script in your pipeline and pull the output from it back into the pipeline, and make your call.</p>
</content>
<p>
</p>
<p>
<a href='mailto:me@example.com?subject=Reply%20to%20"Overlap%20Joins"'>
Reply to this post by email ↪
</a>
</p>
</main>
<footer><small>
| Made with <a href="https://github.com/clente/hugo-bearcub">Bear Cub</a>
</small></footer>
</body>
</html>

View File

@@ -0,0 +1,19 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>projects | Avinash's Blog</title><meta name=title content="projects"><meta name=description content="Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.
Featured projects
BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link.
PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post.
Other work or contributions
IntelligentReceiptSplitter: A relatively simple predecessor to BorrowChecker that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run.
r.data.table.funs: A very small set of R functions that use data.table, that I found very useful earlier in my career to quickly churn out analyses. It is not ground-breaking, but rather something that anybody with sufficient basic skills in R can understand, and that can save an immense amount of time.
I wrote several chapters of the Polars Book, which have since been moved to the main Polars repository. Polars was a breath of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like data.table and dplyr dominated), so I was eager to make it better for everybody making the switch.
"><meta name=author content><meta name=keywords content><meta property="og:url" content="https://avimallu.dev/projects/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="projects"><meta property="og:description" content="Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.
Featured projects BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link. PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post. Other work or contributions IntelligentReceiptSplitter: A relatively simple predecessor to BorrowChecker that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run. r.data.table.funs: A very small set of R functions that use data.table, that I found very useful earlier in my career to quicky churn out analyses. It is not ground-breaking, but rather something that anybody with sufficient basic skills in R and understand, and save an immense amount of time. I wrote several chapters of the Polars Book, which have since been moved to the main Polars repository. 
Polars was a breadth of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like data.table and dplyr dominated), so I was eager to make it better for everybody making the switch."><meta property="og:locale" content="en_US"><meta property="og:type" content="article"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="projects"><meta name=twitter:description content="Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.
Featured projects BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link. PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post. Other work or contributions IntelligentReceiptSplitter: A relatively simple predecessor to BorrowChecker that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run. r.data.table.funs: A very small set of R functions that use data.table, that I found very useful earlier in my career to quicky churn out analyses. It is not ground-breaking, but rather something that anybody with sufficient basic skills in R and understand, and save an immense amount of time. I wrote several chapters of the Polars Book, which have since been moved to the main Polars repository. Polars was a breadth of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like data.table and dplyr dominated), so I was eager to make it better for everybody making the switch."><meta itemprop=name content="projects"><meta itemprop=description content="Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.
Featured projects BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link. PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post. Other work or contributions IntelligentReceiptSplitter: A relatively simple predecessor to BorrowChecker that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run. r.data.table.funs: A very small set of R functions that use data.table, that I found very useful earlier in my career to quicky churn out analyses. It is not ground-breaking, but rather something that anybody with sufficient basic skills in R and understand, and save an immense amount of time. I wrote several chapters of the Polars Book, which have since been moved to the main Polars repository. Polars was a breadth of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like data.table and dplyr dominated), so I was eager to make it better for everybody making the switch."><meta itemprop=wordCount content="276"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><p>Most of my work is on private repositories, but I do find some time to learn new topics, contribute back to some of the open source packages I frequently use, or to create interesting tools.</p><h1 id=featured-projects>Featured projects</h1><ol><li><a href=https://avimallu.github.io/BorrowChecker/>BorrowChecker</a>: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. <a href=https://github.com/avimallu/BorrowChecker>Repository link</a>.</li><li><a href=https://github.com/avimallu/PowerPointSnap>PowerPointSnap</a>: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying <a href=https://avimallu.dev/blog/003_powerpointsnap/>blog post</a>.</li></ol><h1 id=other-work-or-contributions>Other work or contributions</h1><ol><li><a href=https://github.com/avimallu/IntelligentReceiptSplitter>IntelligentReceiptSplitter</a>: A relatively simple predecessor to <a href=https://avimallu.github.io/BorrowChecker/>BorrowChecker</a> that focussed on using an OCR framework followed by an LLM based parser to read receipts that could be further split manually. This combination significantly reduced hallucinations from LLMs but was still very computationally intensive to run.</li><li><a href=https://github.com/avimallu/r.data.table.funs>r.data.table.funs</a>: A very small set of R functions that use <code>data.table</code>, that I found very useful earlier in my career to quicky churn out analyses. 
It is not ground-breaking, but rather something that anybody with sufficient basic skills in R can understand, and that can save an immense amount of time.</li><li>I <a href=https://github.com/pola-rs/polars-book/pull/364>wrote</a> <a href=https://github.com/pola-rs/polars-book/pull/358>several</a> <a href=https://github.com/pola-rs/polars-book/pull/365/files>chapters</a> of the Polars Book, which have since been moved to the main Polars repository. Polars was a breath of fresh air in terms of speed and ergonomics, which I had been sorely missing after switching to Python from R (where projects like <code>data.table</code> and <code>dplyr</code> dominated), so I was eager to make it better for everybody making the switch.</li></ol></content><p></p></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

25
public/projects/index.xml Normal file
View File

@@ -0,0 +1,25 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Projects on Avinash&#39;s Blog</title>
<link>http://localhost:1313/projects/</link>
<description>Recent content in Projects on Avinash&#39;s Blog</description>
<generator>Hugo -- gohugo.io</generator>
<language>en-US</language>
<copyright>© Avinash Mallya</copyright>
<atom:link href="http://localhost:1313/projects/index.xml" rel="self" type="application/rss+xml" />
<item>
<title>projects</title>
<link>http://localhost:1313/projects/projects/</link>
<pubDate>Mon, 01 Jan 0001 00:00:00 +0000</pubDate>
<guid>http://localhost:1313/projects/projects/</guid>
<description>&lt;h1 id=&#34;featured-projects&#34;&gt;Featured projects&lt;/h1&gt;&#xA;&lt;ol&gt;&#xA;&lt;li&gt;&lt;a href=&#34;https://avimallu.github.io/BorrowChecker/&#34;&gt;BorrowChecker&lt;/a&gt;: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. &lt;a href=&#34;https://github.com/avimallu/BorrowChecker&#34;&gt;Repository link&lt;/a&gt;.&lt;/li&gt;&#xA;&lt;li&gt;&lt;a href=&#34;https://github.com/avimallu/PowerPointSnap&#34;&gt;PowerPointSnap&lt;/a&gt;: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying &lt;a href=&#34;https://avimallu.github.io/PowerPointSnap&#34;&gt;blog post&lt;/a&gt;.&lt;/li&gt;&#xA;&lt;/ol&gt;</description>
<content:encoded><![CDATA[<h1 id="featured-projects">Featured projects</h1>
<ol>
<li><a href="https://avimallu.github.io/BorrowChecker/">BorrowChecker</a>: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. <a href="https://github.com/avimallu/BorrowChecker">Repository link</a>.</li>
<li><a href="https://github.com/avimallu/PowerPointSnap">PowerPointSnap</a>: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying <a href="https://avimallu.github.io/PowerPointSnap">blog post</a>.</li>
</ol>
]]></content:encoded>
</item>
</channel>
</rss>

View File

@@ -0,0 +1,107 @@
<!DOCTYPE html>
<html lang="en-US">
<head><script src="/livereload.js?mindelay=10&amp;v=2&amp;port=1313&amp;path=livereload" data-no-instant defer></script>
<meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>projects | Avinash&#39;s Blog</title>
<meta name="title" content="projects" />
<meta name="description" content="Featured projects
BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link.
PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post.
" />
<meta name="author" content="" />
<meta name="keywords" content="" />
<meta property="og:url" content="http://localhost:1313/projects/projects/">
<meta property="og:site_name" content="Avinash&#39;s Blog">
<meta property="og:title" content="projects">
<meta property="og:description" content="Featured projects BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link. PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post.">
<meta property="og:locale" content="en_US">
<meta property="og:type" content="article">
<meta property="article:section" content="projects">
<meta property="og:image" content="http://localhost:1313/static/favicon.ico">
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:image" content="http://localhost:1313/static/favicon.ico">
<meta name="twitter:title" content="projects">
<meta name="twitter:description" content="Featured projects BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link. PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post.">
<meta itemprop="name" content="projects">
<meta itemprop="description" content="Featured projects BorrowChecker: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. Repository link. PowerPointSnap: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying blog post.">
<meta itemprop="wordCount" content="81">
<meta itemprop="image" content="http://localhost:1313/static/favicon.ico">
<meta name="referrer" content="no-referrer-when-downgrade" />
<link href="/original.min.css" rel="stylesheet">
</head>
<body>
<header><a class="skip-link" href="#main-content">Skip to main content</a>
<a href="/" class="title"><h1>Avinash&#39;s Blog</h1></a>
<nav>
<a href="/">about</a>
<a href="/blog/">blog</a>
<a href="/projects/projects/">projects</a>
<a href='http://localhost:1313/index.xml'>rss</a>
</nav>
</header>
<main id="main-content">
<content>
<h1 id="featured-projects">Featured projects</h1>
<ol>
<li><a href="https://avimallu.github.io/BorrowChecker/">BorrowChecker</a>: A play on the same concept in Rust, this is a simple web-app that allows you to split complex receipts with multiple people in a simple manner. Runs entirely in-browser. Made with Dioxus and Rust. <a href="https://github.com/avimallu/BorrowChecker">Repository link</a>.</li>
<li><a href="https://github.com/avimallu/PowerPointSnap">PowerPointSnap</a>: A mostly feature complete tool for PowerPoint on VBA that is filled with a lot of tricks to make it easy to consistently format presentations to impress clients - from my consulting days. Written in VBA. See accompanying <a href="https://avimallu.github.io/PowerPointSnap">blog post</a>.</li>
</ol>
</content>
<p>
</p>
</main>
<footer><small>
© Avinash Mallya | Design via <a href="https://github.com/clente/hugo-bearcub">Bear Cub</a>.
</small></footer>
</body>
</html>

3
public/robots.txt Normal file
View File

@@ -0,0 +1,3 @@
User-agent: *
Allow: /
Sitemap: https://avimallu.dev/sitemap.xml

1
public/sitemap.xml Normal file
View File

@@ -0,0 +1 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml"><url><loc>https://avimallu.dev/</loc><lastmod>2023-10-20T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/blog/</loc><lastmod>2023-10-20T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/projects/</loc></url><url><loc>https://avimallu.dev/tags/powerpoint/</loc><lastmod>2023-10-20T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/ppt/</loc><lastmod>2023-10-20T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/blog/003_powerpointsnap/</loc><lastmod>2023-10-20T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/</loc><lastmod>2023-10-20T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/vba/</loc><lastmod>2023-10-20T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/approximate/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/category/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/faiss/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/blog/002_representative_samples/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/graph/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/nearest/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/neighbor/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/network/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/networkx/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/polars/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/tags/representative/</loc><lastmod>2023-10-19T00:00:00+00:00<
/lastmod></url><url><loc>https://avimallu.dev/tags/samples/</loc><lastmod>2023-10-19T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/blog/001_overlap_joins/</loc><lastmod>2023-06-22T00:00:00+00:00</lastmod></url><url><loc>https://avimallu.dev/categories/</loc></url></urlset>

1
public/syntax.min.css vendored Normal file
View File

@@ -0,0 +1 @@
.bg{color:#f8f8f2;background-color:#282a36}.chroma{color:#f8f8f2;background-color:#282a36}.chroma .lnlinks{outline:none;text-decoration:none;color:inherit}.chroma .lntd{vertical-align:top;padding:0;margin:0;border:0}.chroma .lntable{border-spacing:0;padding:0;margin:0;border:0}.chroma .hl{background-color:#ffc}.chroma .lnt{white-space:pre;user-select:none;margin-right:.4em;padding:0 .4em;color:#929292}.chroma .ln{white-space:pre;user-select:none;margin-right:.4em;padding:0 .4em;color:#929292}.chroma .line{display:flex}.chroma .k{color:#ff79c6}.chroma .kc{color:#ff79c6}.chroma .kd{color:#8be9fd;font-style:italic}.chroma .kn{color:#ff79c6}.chroma .kp{color:#ff79c6}.chroma .kr{color:#ff79c6}.chroma .kt{color:#8be9fd}.chroma .na{color:#50fa7b}.chroma .nb{color:#8be9fd;font-style:italic}.chroma .nc{color:#50fa7b}.chroma .nf{color:#50fa7b}.chroma .nl{color:#8be9fd;font-style:italic}.chroma .nt{color:#ff79c6}.chroma .nv{color:#8be9fd;font-style:italic}.chroma .vc{color:#8be9fd;font-style:italic}.chroma .vg{color:#8be9fd;font-style:italic}.chroma .vi{color:#8be9fd;font-style:italic}.chroma .s{color:#f1fa8c}.chroma .sa{color:#f1fa8c}.chroma .sb{color:#f1fa8c}.chroma .sc{color:#f1fa8c}.chroma .dl{color:#f1fa8c}.chroma .sd{color:#f1fa8c}.chroma .s2{color:#f1fa8c}.chroma .se{color:#f1fa8c}.chroma .sh{color:#f1fa8c}.chroma .si{color:#f1fa8c}.chroma .sx{color:#f1fa8c}.chroma .sr{color:#f1fa8c}.chroma .s1{color:#f1fa8c}.chroma .ss{color:#f1fa8c}.chroma .m{color:#bd93f9}.chroma .mb{color:#bd93f9}.chroma .mf{color:#bd93f9}.chroma .mh{color:#bd93f9}.chroma .mi{color:#bd93f9}.chroma .il{color:#bd93f9}.chroma .mo{color:#bd93f9}.chroma .o{color:#ff79c6}.chroma .ow{color:#ff79c6}.chroma .c{color:#8491b8}.chroma .ch{color:#8491b8}.chroma .cm{color:#8491b8}.chroma .c1{color:#8491b8}.chroma .cs{color:#8491b8}.chroma .cp{color:#ff79c6}.chroma .cpf{color:#ff79c6}.chroma .gd{color:#f55}.chroma .ge{text-decoration:underline}.chroma .gh{font-weight:700}.chroma 
.gi{color:#50fa7b;font-weight:700}.chroma .go{color:#44475a}.chroma .gu{font-weight:700}.chroma .gl{text-decoration:underline}

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Approximate | Avinash's Blog</title><meta name=title content="Approximate"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/approximate/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Approximate"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Approximate"><meta itemprop=name content="Approximate"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/approximate/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Approximate"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Approximate on Avinash's Blog</title><link>https://avimallu.dev/tags/approximate/</link><description>Recent content in Approximate on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/approximate/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> indicating the command used, and the output will appear without those prefixes.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? A small sample makes the model faster to train - <em>and</em> keeps the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Eucledian Matrix</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that sends only the necessary data as that for a particular category, and then create the database. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Eucledian or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em> i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are sufficiently similar to B within a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully the small visual below helps.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended for many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very starry night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Lower the <code>min_cosine_distance</code> value (i.e. relax the minimum similarity threshold) if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate it for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You can download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map FAISS result positions back to the original `row_idx` values in `data`</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows you to run fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Category | Avinash's Blog</title><meta name=title content="Category"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/category/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Category"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Category"><meta itemprop=name content="Category"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/category/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Category"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Category on Avinash's Blog</title><link>https://avimallu.dev/tags/category/</link><description>Recent content in Category on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/category/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> indicating the command used, and the output to be without those prefixes.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? It helps that it&rsquo;ll make the model faster to train - <em>and</em> keep the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Eucledian Matrix</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that sends only the necessary data as that for a particular category, and then create the database. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Eucledian or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to each of them. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em>, i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each similar enough to B to meet a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully the small visual below helps.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended to many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very Starry Night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just run it once for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows you to run fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Faiss | Avinash's Blog</title><meta name=title content="Faiss"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/faiss/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Faiss"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Faiss"><meta itemprop=name content="Faiss"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/faiss/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Faiss"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

383
public/tags/faiss/index.xml Normal file
View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Faiss on Avinash's Blog</title><link>https://avimallu.dev/tags/faiss/</link><description>Recent content in Faiss on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/faiss/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> indicating the command used, and the output will appear without those prefixes.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? It helps that it&rsquo;ll make the model faster to train - <em>and</em> keep the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Euclidean (L2) distance</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that selects only the data for a particular category, and then creates the database from it. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Euclidean or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em> i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are sufficiently similar to B within a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully a small visual below would help.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended for many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very starry night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phone Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate it for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows you to perform fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Graph | Avinash's Blog</title><meta name=title content="Graph"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/graph/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Graph"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Graph"><meta itemprop=name content="Graph"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/graph/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Graph"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

383
public/tags/graph/index.xml Normal file
View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Graph on Avinash's Blog</title><link>https://avimallu.dev/tags/graph/</link><description>Recent content in Graph on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/graph/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> prefixing the command used; the output appears without that prefix.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? A small sample makes the model faster to train - <em>and</em> keeps the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium-scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that there are shortcuts that ANN algorithms take to give you, if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Eucledian Matrix</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that sends only the necessary data as that for a particular category, and then create the database. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Eucledian or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em> i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are both sufficiently similar to B, i.e. within a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully the small visual below helps.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended to many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very starry night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just run it for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach:</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

19
public/tags/index.html Normal file
View File

@@ -0,0 +1,19 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Tags | Avinash's Blog</title><meta name=title content="Tags"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Tags"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Tags"><meta itemprop=name content="Tags"><meta itemprop=datePublished content="2023-10-20T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-20T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Tags"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-20 pubdate>2023-10-20
</time></i></span><a href=/tags/powerpoint/>Powerpoint</a></li><li><span><i><time datetime=2023-10-20 pubdate>2023-10-20
</time></i></span><a href=/tags/ppt/>Ppt</a></li><li><span><i><time datetime=2023-10-20 pubdate>2023-10-20
</time></i></span><a href=/tags/vba/>Vba</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/approximate/>Approximate</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/category/>Category</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/faiss/>Faiss</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/graph/>Graph</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/nearest/>Nearest</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/neighbor/>Neighbor</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/network/>Network</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/networkx/>Networkx</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/polars/>Polars</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/representative/>Representative</a></li><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/tags/samples/>Samples</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

1
public/tags/index.xml Normal file
View File

@@ -0,0 +1 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Tags on Avinash's Blog</title><link>https://avimallu.dev/tags/</link><description>Recent content in Tags on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Fri, 20 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/index.xml" rel="self" type="application/rss+xml"/><item><title>Powerpoint</title><link>https://avimallu.dev/tags/powerpoint/</link><pubDate>Fri, 20 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/powerpoint/</guid><description/><content:encoded></content:encoded></item><item><title>Ppt</title><link>https://avimallu.dev/tags/ppt/</link><pubDate>Fri, 20 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/ppt/</guid><description/><content:encoded></content:encoded></item><item><title>Vba</title><link>https://avimallu.dev/tags/vba/</link><pubDate>Fri, 20 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/vba/</guid><description/><content:encoded></content:encoded></item><item><title>Approximate</title><link>https://avimallu.dev/tags/approximate/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/approximate/</guid><description/><content:encoded></content:encoded></item><item><title>Category</title><link>https://avimallu.dev/tags/category/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/category/</guid><description/><content:encoded></content:encoded></item><item><title>Faiss</title><link>https://avimallu.dev/tags/faiss/</link><pubDate>Thu, 19 Oct 2023 00:00:00 
+0000</pubDate><guid>https://avimallu.dev/tags/faiss/</guid><description/><content:encoded></content:encoded></item><item><title>Graph</title><link>https://avimallu.dev/tags/graph/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/graph/</guid><description/><content:encoded></content:encoded></item><item><title>Nearest</title><link>https://avimallu.dev/tags/nearest/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/nearest/</guid><description/><content:encoded></content:encoded></item><item><title>Neighbor</title><link>https://avimallu.dev/tags/neighbor/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/neighbor/</guid><description/><content:encoded></content:encoded></item><item><title>Network</title><link>https://avimallu.dev/tags/network/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/network/</guid><description/><content:encoded></content:encoded></item><item><title>Networkx</title><link>https://avimallu.dev/tags/networkx/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/networkx/</guid><description/><content:encoded></content:encoded></item><item><title>Polars</title><link>https://avimallu.dev/tags/polars/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/polars/</guid><description/><content:encoded></content:encoded></item><item><title>Representative</title><link>https://avimallu.dev/tags/representative/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/representative/</guid><description/><content:encoded></content:encoded></item><item><title>Samples</title><link>https://avimallu.dev/tags/samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/tags/samples/</guid><description/><content:encoded></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Nearest | Avinash's Blog</title><meta name=title content="Nearest"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/nearest/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Nearest"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Nearest"><meta itemprop=name content="Nearest"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/nearest/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Nearest"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Nearest on Avinash's Blog</title><link>https://avimallu.dev/tags/nearest/</link><description>Recent content in Nearest on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/nearest/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> prefixing the command used, and the output shown without that prefix.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? It helps that it&rsquo;ll make the model faster to train - <em>and</em> keep the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium-scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that there are shortcuts that ANN algorithms take to give you, if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Euclidean (L2) distance</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that selects only the data for a particular category, and then creates the database from it. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Euclidean or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to each of them. Let&rsquo;s then write a function to get the following information: the index of the data point whose nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em>, i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are both sufficiently similar to B, i.e. within a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully the small visual below helps.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended to many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of this rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very starry night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phone Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate it for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Neighbor | Avinash's Blog</title><meta name=title content="Neighbor"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/neighbor/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Neighbor"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Neighbor"><meta itemprop=name content="Neighbor"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/neighbor/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Neighbor"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Neighbor on Avinash's Blog</title><link>https://avimallu.dev/tags/neighbor/</link><description>Recent content in Neighbor on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/neighbor/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> prefixing each command, and the output shown without that prefix.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? A small sample makes the model faster to train - <em>and</em> keeps the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that there are shortcuts that ANN algorithms take to give you, if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Eucledian Matrix</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that sends only the necessary data as that for a particular category, and then create the database. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Eucledian or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index of the point whose nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em>, i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each sufficiently similar to B within a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully the small visual below helps.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended to many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very Starry Night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate it for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows you to run fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Network | Avinash's Blog</title><meta name=title content="Network"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/network/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Network"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Network"><meta itemprop=name content="Network"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/network/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Network"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Network on Avinash's Blog</title><link>https://avimallu.dev/tags/network/</link><description>Recent content in Network on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/network/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> indicating the command used, and the output shown without that prefix.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? It helps that it&rsquo;ll make the model faster to train - <em>and</em> keep the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Euclidean distance</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that sends only the necessary data as that for a particular category, and then create the database. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Euclidean or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to each of them. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em>, i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are each similar enough to B to meet a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully the small visual below helps.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended to many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of this rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very Starry Night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just run it for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows you fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Networkx | Avinash's Blog</title><meta name=title content="Networkx"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/networkx/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Networkx"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Networkx"><meta itemprop=name content="Networkx"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/networkx/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Networkx"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Networkx on Avinash's Blog</title><link>https://avimallu.dev/tags/networkx/</link><description>Recent content in Networkx on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/networkx/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> prefixing the command used; the output appears without that prefix.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? A small sample makes the model faster to train - <em>and</em> keeps the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you, if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Eucledian Matrix</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that sends only the necessary data as that for a particular category, and then create the database. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Eucledian or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em> i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are both sufficiently similar to B within a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully a small visual below would help.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended for many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very starry night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate it for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach, here it is:</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows you to run fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Polars | Avinash's Blog</title><meta name=title content="Polars"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/polars/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Polars"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Polars"><meta itemprop=name content="Polars"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/polars/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Polars"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Polars on Avinash's Blog</title><link>https://avimallu.dev/tags/polars/</link><description>Recent content in Polars on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/polars/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> prefixing the command used, while the output appears without that prefix.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? A small sample makes the model faster to train - <em>and</em> keeps the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium-scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it suffices to understand that ANN algorithms take shortcuts to give you, if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Eucledian Matrix</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that sends only the necessary data as that for a particular category, and then create the database. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Eucledian or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em> i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are both sufficiently similar to B, i.e. within a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully the small visual below helps.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended for many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very Starry Night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate it for each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of k (from 5 to 100) nearest neighbors for <strong>all</strong> (or a sample of data points in case your dataset is incredibly HUGE) data points in the ANN database.</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Powerpoint | Avinash's Blog</title><meta name=title content="Powerpoint"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/powerpoint/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Powerpoint"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Powerpoint"><meta itemprop=name content="Powerpoint"><meta itemprop=datePublished content="2023-10-20T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-20T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/powerpoint/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Powerpoint"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-20 pubdate>2023-10-20
</time></i></span><a href=/blog/003_powerpointsnap/>Quick hacks to make client-ready presentations</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,101 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Powerpoint on Avinash's Blog</title><link>https://avimallu.dev/tags/powerpoint/</link><description>Recent content in Powerpoint on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Fri, 20 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/powerpoint/index.xml" rel="self" type="application/rss+xml"/><item><title>Quick hacks to make client-ready presentations</title><link>https://avimallu.dev/blog/003_powerpointsnap/</link><pubDate>Fri, 20 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/003_powerpointsnap/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (&lt;em>decks&lt;/em> in consulting lingo - not even &lt;em>slide decks&lt;/em>). However, it was rather repetitive. Thus, was born PowerPointSnap.&lt;/p>
&lt;h1 id="what-is-it">What is it?&lt;/h1>
&lt;p>I&amp;rsquo;ll write this down as pointers.&lt;/p>
&lt;ol>
&lt;li>It&amp;rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.&lt;/li>
&lt;li>It&amp;rsquo;s Windows only - it&amp;rsquo;s unlikely to work on MacOS.&lt;/li>
&lt;li>It&amp;rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.&lt;/li>
&lt;/ol>
&lt;h1 id="how-do-i-get-it">How do I get it?&lt;/h1>
&lt;p>The project is available on this &lt;a href="https://github.com/avimallu/PowerPointSnap">Github repo&lt;/a>. The instructions to install it are available there, but here&amp;rsquo;s the down-low:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (<em>decks</em> in consulting lingo - not even <em>slide decks</em>). However, it was rather repetitive. Thus, was born PowerPointSnap.</p>
<h1 id="what-is-it">What is it?</h1>
<p>I&rsquo;ll write this down as pointers.</p>
<ol>
<li>It&rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.</li>
<li>It&rsquo;s Windows only - it&rsquo;s unlikely to work on MacOS.</li>
<li>It&rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.</li>
</ol>
<h1 id="how-do-i-get-it">How do I get it?</h1>
<p>The project is available on this <a href="https://github.com/avimallu/PowerPointSnap">Github repo</a>. The instructions to install it are available there, but here&rsquo;s the down-low:</p>
<ol>
<li>Download the Snap.ppam file to your system.</li>
<li>Enable the developer options.</li>
<li>Go to the Developer tab, and click on PowerPoint Add-ins.</li>
<li>Click on Add New. Choose the location of the file you just downloaded. Click Close.</li>
<li>To uninstall, repeat the process, and simply click on Remove this time.</li>
</ol>
<h1 id="what-can-i-do-with-it">What can I do with it?</h1>
<p>Frankly, a LOT. The base concept of this tool is:</p>
<ol>
<li>&ldquo;Set&rdquo; a shape as the one you want to copy a property from.</li>
<li>Select any property from the list to automatically apply it.</li>
</ol>
<p>Here&rsquo;s a non-exhaustive list of all the options available.</p>
<h2 id="apply-properties-of-shapes-directly">Apply properties of shapes directly</h2>
<p>This is the part of the interface that can be used for shapes (which include charts and tables).</p>
<p><img src="/blog/003_powerpointsnap/01_Shapes.png" alt="The UI for copying shape properties"></p>
<p>To use, first select a <em>shape</em> object, click on &ldquo;Set&rdquo;. Then, choose the object you want to <em>Snap</em> its properties to (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.</p>
<p>Note that it&rsquo;s probably not a good idea to apply a property of a shape to a table - if you want to make the entire table orange, there are probably better built-in ways to do it than to use <em>Snap</em>.</p>
<h2 id="beautify-charts-with-snappable-properties">Beautify charts with <em>Snap</em>pable properties</h2>
<p>Charts are also supported, with dedicated features for it.</p>
<p><img src="/blog/003_powerpointsnap/02_Charts.png" alt="The UI for copying chart properties"></p>
<p>What do these features do? You should be able to hover over the option and get a tooltip that shows what it&rsquo;s capable of, but here&rsquo;s another summary just in case:</p>
<ol>
<li>Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the &ldquo;set&rdquo; chart to the one you&rsquo;ve selected. I couldn&rsquo;t put in just $x$ and $y$ here because Microsoft internally doesn&rsquo;t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn&rsquo;t work well yet for 3D charts.</li>
<li>Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all <em>look</em> exactly the same from a client perspective. But that&rsquo;s usually difficult if you&rsquo;ve already configured the charts a little - which can be remedied with this option!</li>
<li>Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you&rsquo;ve selected with the way it originally is in the &ldquo;set&rdquo; chart. The reason for this feature is simply to avoid going back to <em>Home</em> to click on the <em>Format Painter</em> option again.</li>
<li>Reset Axes Scales: in case you messed up somewhere, you can use this to revert to PowerPoint defaults.</li>
</ol>
<p>The next two options deserve their own section.</p>
<h2 id="customize-the-labels-programmatically">Customize the labels programmatically</h2>
<p>Your immediate senior in a consulting environment would frown at your chart, and then exclaim, &ldquo;I think that&rsquo;s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it&rsquo;s a one time thing!&rdquo;</p>
<p>It&rsquo;s <strong>never</strong> a one time affair. But don&rsquo;t worry, we have this nice feature to help us. If you click on the <em>Customize Label</em> option, you will get this (without the &ldquo;Set&rdquo; option):</p>
<p><img src="/blog/003_powerpointsnap/DataLabelsScreenshot.JPG" alt="The UI for customizing labels."></p>
<p>Never mind the rather unfriendly legend entries. They&rsquo;re just here to demonstrate that you can do the following kinds of whacky abilities with your own chart!</p>
<h3 id="screenshots-of-the-chart-snapability">Screenshots of the chart <em>snap</em>ability</h3>
<p>Of course, visuals will do it more justice. For example, look at this image:</p>
<p><img src="/blog/003_powerpointsnap/Revenue_Presentation_1.png" alt="Theres a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles arent centered."></p>
<p>Here&rsquo;s what you can do:</p>
<ol>
<li>Click on the left chart. Press &ldquo;Set&rdquo; in the toolbar for <em>Snap</em>.</li>
<li>Click on the right chart, and then go through the following:
<ol>
<li>In <em>Shapes</em>, click on <em>Dim</em>. This will align the shapes of the chart.</li>
<li>Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.</li>
<li>You&rsquo;ll notice that the chart area still doesn&rsquo;t match, nor does the title.</li>
<li>In <em>Charts</em>, click on <em>Sync Plot Area</em> and <em>Sync Title Area</em>, and watch the magic unfold.</li>
<li>Now, click on the second chart, and click on &ldquo;Set&rdquo;. Let&rsquo;s align the axes of the first chart to the second one.</li>
<li>Click on the first chart, and then in <em>Charts</em>, click <em>Sync Value Axis</em>.</li>
</ol>
</li>
<li>Let&rsquo;s bring that senior&rsquo;s exclamation back into play - (s)he wants you to highlight <em>only</em> Profit labels, and that too every 2 iterations. To do this:
<ol>
<li>Click on <em>Customize Labels</em> after clicking on either chart.</li>
<li>You&rsquo;ll get the screen shown in the previous section. Make sure to adjust the values such that it&rsquo;s exactly like the screenshot there.</li>
<li>Click on &ldquo;Save and Run&rdquo;. This will <em>save</em> the configuration you&rsquo;ve selected, and <em>run</em> it on the chart you&rsquo;ve selected.</li>
<li>Click the other chart. Then, in <em>Charts</em>, click on <em>Rerun Customization</em>.</li>
</ol>
</li>
</ol>
<p>This is what your results should look like:</p>
<p><img src="/blog/003_powerpointsnap/Revenue_Presentation_2.png" alt="Everything almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it quickly… maybe they should change some requirements…"></p>
<p>Of course, getting those calculations right is a whole different thing that will need some work.</p>
<h2 id="align-table-dimensions">Align table dimensions</h2>
<p>Oftentimes, you have two tables that show similar values&hellip; you know the drill. Here&rsquo;s what you can do in a scenario such as this:</p>
<p><img src="/blog/003_powerpointsnap/Table_Presentation_1.png" alt="Similar data, but vastly different tables."></p>
<p>This is what the <em>Tables</em> section of the tool looks like:</p>
<p><img src="/blog/003_powerpointsnap/03_Tables.png" alt="The UI for Tables"></p>
<p>To align these tables together,</p>
<ol>
<li>Click on the left table. Press &ldquo;Set&rdquo; in the toolbar for <em>Snap</em>.</li>
<li>Click on the right table.</li>
<li>Click on <em>Shapes</em>, inside it, <em>Dim</em>. Now the shapes of the table are the same.</li>
<li>In <em>Tables</em>, click on <em>Sync Column Widths</em>. Now the columns are also the same.</li>
<li>If you try to align by rows, it fails because the number of rows is not the same in the two tables.</li>
</ol>
<p>Here&rsquo;s what you&rsquo;ll end up with:</p>
<p><img src="/blog/003_powerpointsnap/Table_Presentation_2.png" alt="Similar data, and similar enough tables."></p>
<p>Pretty neat, eh?</p>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Ppt | Avinash's Blog</title><meta name=title content="Ppt"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/ppt/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Ppt"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Ppt"><meta itemprop=name content="Ppt"><meta itemprop=datePublished content="2023-10-20T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-20T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/ppt/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Ppt"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-20 pubdate>2023-10-20
</time></i></span><a href=/blog/003_powerpointsnap/>Quick hacks to make client-ready presentations</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

101
public/tags/ppt/index.xml Normal file
View File

@@ -0,0 +1,101 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Ppt on Avinash's Blog</title><link>https://avimallu.dev/tags/ppt/</link><description>Recent content in Ppt on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Fri, 20 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/ppt/index.xml" rel="self" type="application/rss+xml"/><item><title>Quick hacks to make client-ready presentations</title><link>https://avimallu.dev/blog/003_powerpointsnap/</link><pubDate>Fri, 20 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/003_powerpointsnap/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (&lt;em>decks&lt;/em> in consulting lingo - not even &lt;em>slide decks&lt;/em>). However, it was rather repetitive. Thus, was born PowerPointSnap.&lt;/p>
&lt;h1 id="what-is-it">What is it?&lt;/h1>
&lt;p>I&amp;rsquo;ll write this down as pointers.&lt;/p>
&lt;ol>
&lt;li>It&amp;rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.&lt;/li>
&lt;li>It&amp;rsquo;s Windows only - it&amp;rsquo;s unlikely to work on MacOS.&lt;/li>
&lt;li>It&amp;rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.&lt;/li>
&lt;/ol>
&lt;h1 id="how-do-i-get-it">How do I get it?&lt;/h1>
&lt;p>The project is available on this &lt;a href="https://github.com/avimallu/PowerPointSnap">Github repo&lt;/a>. The instructions to install it are available there, but here&amp;rsquo;s the down-low:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>When I worked in healthcare consulting, I often spent a LOT of my time creating PowerPoint presentations (<em>decks</em> in consulting lingo - not even <em>slide decks</em>). However, it was rather repetitive. Thus, was born PowerPointSnap.</p>
<h1 id="what-is-it">What is it?</h1>
<p>I&rsquo;ll write this down as pointers.</p>
<ol>
<li>It&rsquo;s a VBA based PowerPoint add-on. Just a set of commands that work well with each other.</li>
<li>It&rsquo;s Windows only - it&rsquo;s unlikely to work on MacOS.</li>
<li>It&rsquo;s installation-free and is not an executable, which makes it perfect for locked-down corporate environments, as long as you have the permission to download files.</li>
</ol>
<h1 id="how-do-i-get-it">How do I get it?</h1>
<p>The project is available on this <a href="https://github.com/avimallu/PowerPointSnap">Github repo</a>. The instructions to install it are available there, but here&rsquo;s the down-low:</p>
<ol>
<li>Download the Snap.ppam file to your system.</li>
<li>Enable the developer options.</li>
<li>Go to the Developer tab, and click on PowerPoint Add-ins.</li>
<li>Click on Add New. Choose the location of the file you just downloaded. Click Close.</li>
<li>To uninstall, repeat the process, and simply click on Remove this time.</li>
</ol>
<h1 id="what-can-i-do-with-it">What can I do with it?</h1>
<p>Frankly, a LOT. The base concept of this tool is:</p>
<ol>
<li>&ldquo;Set&rdquo; a shape as the one you want to copy a property from.</li>
<li>Select any property from the list to automatically apply it.</li>
</ol>
<p>Here&rsquo;s a non-exhaustive list of all the options available.</p>
<h2 id="apply-properties-of-shapes-directly">Apply properties of shapes directly</h2>
<p>This is the part of the interface that can be used for shapes (which include charts and tables).</p>
<p><img src="/blog/003_powerpointsnap/01_Shapes.png" alt="The UI for copying shape properties"></p>
<p>To use, first select a <em>shape</em> object, click on &ldquo;Set&rdquo;. Then, choose the object you want to <em>Snap</em> its properties to (see how I got the inspiration for the name?). You should be able to copy all compatible properties - if something is not copy-able, the tool will show an error, and then let you exit.</p>
<p>Note that it&rsquo;s probably not a good idea to apply a property of a shape to a table - if you want to make the entire table orange, there are probably better built-in ways to do it than to use <em>Snap</em>.</p>
<h2 id="beautify-charts-with-snappable-properties">Beautify charts with <em>Snap</em>pable properties</h2>
<p>Charts are also supported, with dedicated features for it.</p>
<p><img src="/blog/003_powerpointsnap/02_Charts.png" alt="The UI for copying chart properties"></p>
<p>What do these features do? You should be able to hover over the option and get a tooltip that shows what it&rsquo;s capable of, but here&rsquo;s another summary just in case:</p>
<ol>
<li>Sync Value/Date Axis: this will try to align the range, the ticks, the numeric values etc. of the &ldquo;set&rdquo; chart to the one you&rsquo;ve selected. I couldn&rsquo;t put in just $x$ and $y$ here because Microsoft internally doesn&rsquo;t label them that way. Try either of these two options (you can undo!) and see what works best for your chart. This doesn&rsquo;t work well yet for 3D charts.</li>
<li>Sync Plot/Title/Legend: often, you want to centre a title, or make sure that multiple charts that show nearly identical things for different variables all <em>look</em> exactly the same from a client perspective. But that&rsquo;s usually difficult if you&rsquo;ve already configured the charts a little - which can be remedied with this option!</li>
<li>Format Painter: this is simply a helper for the normal format painter to align the formats of the text that you&rsquo;ve selected with the way it originally is in the &ldquo;set&rdquo; chart. The reason for this feature is simply to avoid going back to <em>Home</em> to click on the <em>Format Painter</em> option again.</li>
<li>Reset Axes Scales: in case you messed up somewhere, you can use this to revert to PowerPoint defaults.</li>
</ol>
<p>The next two options deserve their own section.</p>
<h2 id="customize-the-labels-programmatically">Customize the labels programmatically</h2>
<p>Your immediate senior in a consulting environment would frown at your chart, and then exclaim, &ldquo;I think that&rsquo;s too many labels for the data points. Can you show them every two/three/four labels? I know this is manual work, but it&rsquo;s a one time thing!&rdquo;</p>
<p>It&rsquo;s <strong>never</strong> a one time affair. But don&rsquo;t worry, we have this nice feature to help us. If you click on the <em>Customize Label</em> option, you will get this (without the &ldquo;Set&rdquo; option):</p>
<p><img src="/blog/003_powerpointsnap/DataLabelsScreenshot.JPG" alt="The UI for customizing labels."></p>
<p>Never mind the rather unfriendly legend entries. They&rsquo;re just here to demonstrate that you can do the following kinds of whacky abilities with your own chart!</p>
<h3 id="screenshots-of-the-chart-snapability">Screenshots of the chart <em>snap</em>ability</h3>
<p>Of course, visuals will do it more justice. For example, look at this image:</p>
<p><img src="/blog/003_powerpointsnap/Revenue_Presentation_1.png" alt="Theres a lot wrong with this image. But primarily, the charts are of different sizes, the axes are different, the labels are too clustered, and the titles arent centered."></p>
<p>Here&rsquo;s what you can do:</p>
<ol>
<li>Click on the left chart. Press &ldquo;Set&rdquo; in the toolbar for <em>Snap</em>.</li>
<li>Click on the right chart, and then go through the following:
<ol>
<li>In <em>Shapes</em>, click on <em>Dim</em>. This will align the shapes of the chart.</li>
<li>Use the guides that you get while moving the chart to align the positions of the two charts now that their shapes are equal.</li>
<li>You&rsquo;ll notice that the chart area still doesn&rsquo;t match, nor does the title.</li>
<li>In <em>Charts</em>, click on <em>Sync Plot Area</em> and <em>Sync Title Area</em>, and watch the magic unfold.</li>
<li>Now, click on the second chart, and click on &ldquo;Set&rdquo;. Let&rsquo;s align the axes of the first chart to the second one.</li>
<li>Click on the first chart, and then in <em>Charts</em>, click <em>Sync Value Axis</em>.</li>
</ol>
</li>
<li>Let&rsquo;s bring that senior&rsquo;s exclamation back into play - (s)he wants you to highlight <em>only</em> Profit labels, and that too every 2 iterations. To do this:
<ol>
<li>Click on <em>Customize Labels</em> after clicking on either chart.</li>
<li>You&rsquo;ll get the screen shown in the previous section. Make sure to adjust the values such that it&rsquo;s exactly like the screenshot there.</li>
<li>Click on &ldquo;Save and Run&rdquo;. This will <em>save</em> the configuration you&rsquo;ve selected, and <em>run</em> it on the chart you&rsquo;ve selected.</li>
<li>Click the other chart. Then, in <em>Charts</em>, click on <em>Rerun Customization</em>.</li>
</ol>
</li>
</ol>
<p>This is what your results should look like:</p>
<p><img src="/blog/003_powerpointsnap/Revenue_Presentation_2.png" alt="Everything almost consistent. Your senior rests their eyes, and secretly wonders how you managed to do it quickly… maybe they should change some requirements…"></p>
<p>Of course, getting those calculations right is a whole different thing that will need some work.</p>
<h2 id="align-table-dimensions">Align table dimensions</h2>
<p>Oftentimes, you have two tables that show similar values&hellip; you know the drill. Here&rsquo;s what you can do in a scenario such as this:</p>
<p><img src="/blog/003_powerpointsnap/Table_Presentation_1.png" alt="Similar data, but vastly different tables."></p>
<p>This is what the <em>Tables</em> section of the tool looks like:</p>
<p><img src="/blog/003_powerpointsnap/03_Tables.png" alt="The UI for Tables"></p>
<p>To align these tables together,</p>
<ol>
<li>Click on the left table. Press &ldquo;Set&rdquo; in the toolbar for <em>Snap</em>.</li>
<li>Click on the right table.</li>
<li>Click on <em>Shapes</em>, inside it, <em>Dim</em>. Now the shapes of the table are the same.</li>
<li>In <em>Tables</em>, click on <em>Sync Column Widths</em>. Now the columns are also the same.</li>
<li>If you try to align by rows, it fails because the number of rows is not the same in the two tables.</li>
</ol>
<p>Here&rsquo;s what you&rsquo;ll end up with:</p>
<p><img src="/blog/003_powerpointsnap/Table_Presentation_2.png" alt="Similar data, and similar enough tables."></p>
<p>Pretty neat, eh?</p>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Representative | Avinash's Blog</title><meta name=title content="Representative"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/representative/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Representative"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Representative"><meta itemprop=name content="Representative"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/representative/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Representative"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

View File

@@ -0,0 +1,383 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/"><channel><title>Representative on Avinash's Blog</title><link>https://avimallu.dev/tags/representative/</link><description>Recent content in Representative on Avinash's Blog</description><generator>Hugo -- gohugo.io</generator><language>en-US</language><copyright>© Avinash Mallya</copyright><lastBuildDate>Thu, 19 Oct 2023 00:00:00 +0000</lastBuildDate><atom:link href="https://avimallu.dev/tags/representative/index.xml" rel="self" type="application/rss+xml"/><item><title>Finding representative samples efficiently for large datasets</title><link>https://avimallu.dev/blog/002_representative_samples/</link><pubDate>Thu, 19 Oct 2023 00:00:00 +0000</pubDate><guid>https://avimallu.dev/blog/002_representative_samples/</guid><description>&lt;h1 id="premise">Premise&lt;/h1>
&lt;p>In this day and age, we&amp;rsquo;re not short on data. &lt;em>Good&lt;/em> data, on the other hand, is very valuable. When you&amp;rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.&lt;/p>
&lt;p>Let&amp;rsquo;s formalize the problem a little so that a proper approach can be developed. Here&amp;rsquo;s the problem statement:&lt;/p>
&lt;ol>
&lt;li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.&lt;/li>
&lt;li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.&lt;/li>
&lt;li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.&lt;/li>
&lt;/ol>
&lt;h2 id="in-a-hurry">In a hurry?&lt;/h2>
&lt;p>Here&amp;rsquo;s what you need to do:&lt;/p></description><content:encoded><![CDATA[<h1 id="premise">Premise</h1>
<p>In this day and age, we&rsquo;re not short on data. <em>Good</em> data, on the other hand, is very valuable. When you&rsquo;ve got a large amount of improperly labelled data, it may become hard to find a representative dataset to train a model on such that it generalizes well.</p>
<p>Let&rsquo;s formalize the problem a little so that a proper approach can be developed. Here&rsquo;s the problem statement:</p>
<ol>
<li>You have a large-ish set of (imperfectly) labelled data points. These data points can be represented as a 2D matrix.</li>
<li>You need to train a model to classify these data points on either these labels, or on labels derived from imperfect labels.</li>
<li>You need a good (but not perfect) representative sample for the model to be generalizable, but there are too many data points for each label to manually pick representative examples.</li>
</ol>
<h2 id="in-a-hurry">In a hurry?</h2>
<p>Here&rsquo;s what you need to do:</p>
<ol>
<li>Read the premise and see if it fits your problem.</li>
<li>Go to the <strong>For the folks in a hurry!</strong> section at the end to find the generic solution and how it works.</li>
</ol>
<h2 id="why-do-we-need-representative-samples">Why do we need representative samples?</h2>
<p>Generally, three things come to mind:</p>
<ol>
<li>Allows the model to be generalizable for all <em>kinds</em> of data points <em>within</em> a category.</li>
<li>Allows for faster training of the model - you need <em>fewer</em> data points to get the same accuracy!</li>
<li>Allows maintaining the training set - if your training set needs validation by experts or annotations, this keeps your costs low!</li>
</ol>
<h1 id="define-the-data">Define the data</h1>
<p>This data can be practically anything that can be represented as a 2D matrix.</p>
<p>There are exceptions. Raw image data (as numbers) might get difficult because even if you flatten them, there&rsquo;ll be significant correlation between features. For example, a face can appear practically anywhere in the image, and all pixels centered around the face will be highly correlated, even if they are on different lines. A workaround in this case would be to pipe the image through a CNN model that has been trained on some <em>generic</em> task and produces a 1D representation of a single image in the final hidden layer before the output. Other data will need further processing along similar lines.</p>
<h2 id="get-a-specific-dataset">Get a specific dataset</h2>
<p>For this specific article, I will use the <a href="https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization/data">ShopMania dataset on Kaggle</a>. I apologize in advance for not using a more easily accessible dataset (you need to sign into Kaggle to download it) - and I&rsquo;m not 100% sure if the GPL allows me to create a copy of the data and place it in my own repository. Nevertheless, the data (if you download it and choose to use it instead of some other dataset) will look like this:</p>
<blockquote>
<p><strong>NOTE</strong>: whenever I want to show an output <em>along</em> with the code I used for it, you&rsquo;ll see the characters <code>&gt;&gt;</code> indicating the command used, and the output to be without those prefixes.</p>
</blockquote>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">data</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">313_705</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="err">┌────────────┬──────────────────────────────────────────────────────┬─────────────┬────────────────┐</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="err">│</span> <span class="n">product_ID</span> <span class="err">┆</span> <span class="n">product_title</span> <span class="err">┆</span> <span class="n">category_ID</span> <span class="err">┆</span> <span class="n">category_label</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="err">│</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">┆</span> <span class="o">---</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="err">│</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">┆</span> <span class="n">i64</span> <span class="err">┆</span> <span class="nb">str</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"><span class="err">╞════════════╪══════════════════════════════════════════════════════╪═════════════╪════════════════╡</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="err">│</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">twilight</span> <span class="n">central</span> <span class="n">park</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"><span class="err">│</span> <span class="mi">3</span> <span class="err">┆</span> <span class="n">fox</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"><span class="err">│</span> <span class="mi">4</span> <span class="err">┆</span> <span class="n">circulo</span> <span class="n">de</span> <span class="n">papel</span> <span class="n">wall</span> <span class="n">art</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"><span class="err">│</span> <span class="mi">5</span> <span class="err">┆</span> <span class="n">hidden</span> <span class="n">path</span> <span class="nb">print</span> <span class="err">┆</span> <span class="mi">2</span> <span class="err">┆</span> <span class="n">Collectibles</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="err">│</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">┆</span> <span class="err">…</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"><span class="err">│</span> <span class="mi">313703</span> <span class="err">┆</span> <span class="n">deago</span> <span class="n">anti</span> <span class="n">fog</span> <span class="n">swimming</span> <span class="n">diving</span> <span class="n">full</span> <span class="n">face</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">surface</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="n">fr</span> <span class="n">gopro</span> <span class="n">black</span> <span class="n">s</span><span class="o">/</span><span class="n">m</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="err">│</span> <span class="mi">313704</span> <span class="err">┆</span> <span class="n">etc</span> <span class="n">buys</span> <span class="n">full</span> <span class="n">face</span> <span class="n">gopro</span> <span class="n">compatible</span> <span class="n">snorkel</span> <span class="n">scuba</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">diving</span> <span class="n">mask</span> <span class="n">blue</span> <span class="n">large</span><span class="o">/</span><span class="n">xtralarge</span> <span class="n">blue</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="err">│</span> <span class="mi">313705</span> <span class="err">┆</span> <span class="n">men</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="n">mask</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">blue</span> <span class="n">mask</span> <span class="n">only</span> <span class="n">adult</span> <span class="n">men</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"><span class="err">│</span> <span class="mi">313706</span> <span class="err">┆</span> <span class="n">women</span> <span class="mi">039</span> <span class="n">s</span> <span class="n">full</span> <span class="n">face</span> <span class="n">breathe</span> <span class="n">free</span> <span class="n">diving</span> <span class="n">snorkel</span> <span class="err">┆</span> <span class="mi">229</span> <span class="err">┆</span> <span class="n">Water</span> <span class="n">Sports</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">mask</span> <span class="n">scuba</span> <span class="n">optional</span> <span class="n">hd</span> <span class="n">camera</span> <span class="n">black</span> <span class="n">mask</span> <span class="n">only</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"><span class="err">│</span> <span class="err">┆</span> <span class="n">children</span> <span class="ow">and</span> <span class="n">women</span> <span class="err">┆</span> <span class="err">┆</span> <span class="err">│</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"><span class="err">└────────────┴──────────────────────────────────────────────────────┴─────────────┴────────────────┘</span></span></span></code></pre></div><p>The data documentation on Kaggle states:</p>
<blockquote>
<p>The first dataset originates from ShopMania, a popular online product comparison platform. It enlists tens of millions of products organized in a three-level hierarchy that includes 230 categories. The two higher levels of the hierarchy include 39 categories, whereas the third lower level accommodates the rest 191 leaf categories. Each product is categorized into this tree structure by being mapped to only one leaf category. Some of these 191 leaf categories contain millions of products. However, shopmania.com allows only the first 10,000 products to be retrieved from each category. Under this restriction, our crawler managed to collect 313,706 products.</p>
</blockquote>
<p>For demonstration, I&rsquo;ll just limit the categories to those that have exactly 10,000 occurrences.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="p">)</span></span></span></code></pre></div><p>You&rsquo;ll notice that there are only 17 categories in this dataset. Run this to verify that fact.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">17</span><span class="p">,)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="n">Series</span><span class="p">:</span> <span class="s1">&#39;category_label&#39;</span> <span class="p">[</span><span class="nb">str</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s2">&#34;Kitchen &amp; Dining&#34;</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s2">&#34;Scarves and wraps&#34;</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s2">&#34;Handbags &amp; Wallets&#34;</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="s2">&#34;Rugs Tapestry &amp; Linens&#34;</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;Cell Phones Accessories&#34;</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s2">&#34;Men&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;Jewelry&#34;</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s2">&#34;Belts&#34;</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s2">&#34;Men Lingerie&#34;</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s2">&#34;Crafts&#34;</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="s2">&#34;Football&#34;</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="s2">&#34;Medical Supplies&#34;</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="s2">&#34;Adult&#34;</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="s2">&#34;Hunting&#34;</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="s2">&#34;Women&#39;s Clothing&#34;</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="s2">&#34;Pet Supply&#34;</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="s2">&#34;Office Supplies&#34;</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"><span class="p">]</span></span></span></code></pre></div><p>Note that this is very easy in Polars, which is the package I typically use for data manipulation. I recommend using it over Pandas.</p>
<h2 id="specify-the-task">Specify the task</h2>
<p>Okay - so now we have exactly 10,000 products <em>per</em> category. We only have the title of the product that can be leveraged for categorization. So let me define the task this way:</p>
<blockquote>
<p>Craft a <em>small</em> representative sample for each category.</p>
</blockquote>
<p>Why small? A small sample makes the model faster to train - <em>and</em> keeps the training data manageable in size.</p>
<h1 id="finding-representative-samples">Finding representative samples</h1>
<p>I mentioned earlier that we need to represent data as a 2D matrix for the technique I have in mind to work. How can I translate a list of text to a matrix? The answer&rsquo;s rather simple: use <code>SentenceTransformers</code> to get a string&rsquo;s embedding. You could also use more classic techniques like computing TF-IDF values, or use more advanced transformers, but I&rsquo;ve noticed that <code>SentenceTransformers</code> are able to capture semantic meaning of sentences rather well (assuming you use a good model suited for the language the data is in) - they are trained on sentence similarity after all.</p>
<h2 id="getting-sentencetransformer-embeddings">Getting <code>SentenceTransformer</code> embeddings</h2>
<p>This part is rather simple. If you&rsquo;re unable to install SentenceTransformers, <a href="https://www.sbert.net/docs/installation.html">please check their website</a>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="o">.</span><span class="n">numpy</span><span class="p">())</span></span></span></code></pre></div><p>This process will be slow (~30 minutes) if you don&rsquo;t have a GPU. There are faster approaches, but they are slightly more involved than would be beneficial for a blog post. The wait will be worth it, I promise! In addition, the call to <code>.numpy()</code> at the end is to directly get a single <code>numpy</code> array - otherwise you get a <code>list</code> of <code>numpy</code> arrays, which is rather inefficient. Further, <code>SentenceTransformers</code> will try to run on the GPU if available, and if so, you will need to write <code>.cpu().numpy()</code> so that the tensor is copied from the GPU to the CPU.</p>
<blockquote>
<p><strong>NOTE</strong>: for a proof-of-concept implementation, or if you&rsquo;re on the CPU, try the <code>all-MiniLM-L6-v2</code> model. It&rsquo;s a much smaller and much faster model, although you sacrifice a little in terms of accuracy.</p>
</blockquote>
<h2 id="the-concept-of-_approximate_-nearest-neighbors">The concept of <em>approximate</em> nearest neighbors</h2>
<p>Performing any kind of nearest neighbor algorithm on medium-scale datasets (even bordering 10,000 rows and tens of columns) tends to be slow. A primary driver of this is the need to calculate all, or nearly all, distances between all data points. <em>Approximate</em> nearest neighbor (ANN) algorithms work around this through various approaches, which warrant their own blog post. For now, it would suffice to understand that there are shortcuts that ANN algorithms take to give you if not the exact nearest neighbor, at least <em>one</em> of the nearest neighbors (hence the term <em>approximate</em>).</p>
<p>There are several algorithms that you can use - I shall proceed with <code>faiss</code>, because it has a nice Python interface and is rather easy to work with. You can use any algorithm - a full list of the major ones is <a href="https://github.com/erikbern/ann-benchmarks">available here</a>.</p>
<p>I&rsquo;ll explain why we&rsquo;re in the nearest neighbor territory in due course.</p>
<h3 id="building-the-database">Building the database</h3>
<p>To build the database, all we need is the <code>title_embeddings</code> matrix.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">ann_index</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatL2</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Eucledian Matrix</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="n">ann_index</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">title_embeddings</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl">
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="k">return</span> <span class="n">ann_index</span> <span class="c1"># Faiss considers databases an &#34;index&#34;</span></span></span></code></pre></div><p>This does create <em>a</em> database. But remember, we&rsquo;re trying to find <em>representative samples</em> - which means we need to do this <em>by</em> the category (or label). So let&rsquo;s design a function that sends only the necessary data as that for a particular category, and then create the database. We&rsquo;ll need three pieces of information from this function:</p>
<ol>
<li>The actual <code>faiss</code> database.</li>
<li>The actual subset of data that was used to build this index.</li>
<li>The label indices with respect to the original data that went into the <code>faiss</code> database.</li>
</ol>
<p>(2) and (3) will help us later in rebuilding a &ldquo;network graph&rdquo; that will allow us to reference the original data points.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl">
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl">
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="c1"># Why cosine similarity? It&#39;s easier to specify thresholds - they&#39;ll always be between 0 and 1.4.</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="c1"># If using Eucledian or other distance, we&#39;ll have to spend some time finding a good range</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="c1"># where distances are reasonable. See https://stats.stackexchange.com/a/146279 for details.</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl">
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span></span></span></code></pre></div><h3 id="identifying-the-nearest-neighbors">Identifying the nearest neighbors</h3>
<p>To proceed with getting a representative sample, the next step is to find the nearest neighbors for <strong>all</strong> data points in the database. This isn&rsquo;t too hard - <code>faiss</code> <code>index</code> objects have a built-in <code>search</code> method to find the <code>k</code> nearest neighbors for a given index, along with the (approximate) distance to it. Let&rsquo;s then write a function to get the following information: the label index for which nearest neighbors are being searched, the indices of said nearest neighbors and the distance between them. In network graph parlance, this kind of data is called an <em>edge list</em> i.e. a list of pairs of <em>nodes</em> that are connected, along with any additional information that specifies a property (in this case distance) of the <em>edge</em> that connects these <em>nodes</em>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl">
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">16</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">17</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="p">)</span> </span></span></code></pre></div><h3 id="networkx-and-connected-components">NetworkX and Connected Components</h3>
<p>The next step in the process is to create a network graph using the edge-list. But why?</p>
<p>Remember that we have identified the (k=5) nearest neighbors of <strong>each</strong> data point. Let&rsquo;s say that we have a point A that has a nearest neighbor B. C is <strong>not</strong> a nearest neighbor of A, but it is a nearest neighbor of B. In a network graph, if A and C are both similar enough to B to meet a particular <em>minimum threshold</em>, then A will be connected to C through B! Hopefully the small visual below will help.</p>
<p><img src="/blog/002_representative_samples/001_Network_Cluster_1.png" alt="How a network component is formed."></p>
<p>What happens when such a concept is extended to many data points? Not all of them would be connected - because we&rsquo;re applying a <em>minimum</em> threshold that they have to meet. This is the only heuristic part of the rather fast process. Here&rsquo;s one more helpful visual:</p>
<p><img src="/blog/002_representative_samples/002_Network_Cluster_2.png" alt="How a network cluster is formed."></p>
<p>Very Starry Night-esque vibes here. Let&rsquo;s get to the code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">8</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span></span></span></code></pre></div><h1 id="getting-clusters">Getting clusters</h1>
<p>Now that all the parts of the puzzle are together, let&rsquo;s run it to see what kind of clusters you get for <code>Cell Phones Accessories</code>.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span></span></span></code></pre></div><p>Make sure to configure the following if your results aren&rsquo;t good enough:</p>
<ol>
<li>Relax (i.e. lower) the <code>min_cosine_distance</code> value if you want <em>bigger</em> clusters.</li>
<li>Increase the number of nearest neighbors if you want <em>more</em> matches.</li>
</ol>
<h2 id="viewing-the-components">Viewing the components</h2>
<p>There will likely be many clusters (you can see how many exactly with <code>len(clusters)</code>). Let&rsquo;s look at a random cluster:</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="o">&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 yellow 72570099&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">3</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 black 72570093&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">4</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 lightblue 72570097&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">5</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 blue 72570095&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">6</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 green 72570101&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">7</span><span class="cl"> <span class="s1">&#39;smartphone lanyard with card slot for any phone up to 6 pink 72570091&#39;</span><span class="p">]</span></span></span></code></pre></div><p>Let&rsquo;s see another cluster that had 172(!) members in my run (the clusters themselves will be stable, but their indices may change in each run owing to some inherent randomness in the process).</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="o">&gt;&gt;&gt;</span> <span class="n">clusters</span><span class="p">[</span><span class="mi">6</span><span class="p">]</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="p">[</span><span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case snowflakes iphone 8/7 op qq z051a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 arrows blue op qq a02 58&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s golden pineapple op qq z089a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s butteryfly delight yellow op qq z029d&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 luck of the irish op qq a01 45&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid white op qq a02 16&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"> <span class="o">...</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 flying arrows white op qq hip 20&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 brides maid pink white op qq a02 17&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case iphone 8/7 anemone flowers white op qq z036a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case mustache iphone 8/7 op qq hip 08&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7 modern clear printed phone case oh snap iphone 8/7 op qq z053a&#39;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"> <span class="s1">&#39;otm essentials iphone 8/7/6s clear printed phone case single iphone 8/7/6s desert cacti orange pink op qq a02 22&#39;</span><span class="p">]</span></span></span></code></pre></div><h2 id="running-for-all-categories">Running for all categories</h2>
<p>This isn&rsquo;t that hard (although it may take more than a moment). Just iterate over each category!</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln">1</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h1 id="for-the-folks-in-a-hurry">For the folks in a hurry!</h1>
<p>I get it - you often want a solution that &ldquo;just works&rdquo;. I can come close to it. See below for code and a succinct explanation. For those of my readers who aren&rsquo;t in a hurry, this also serves as a nice summary (and copy-pastable code)!</p>
<h2 id="the-code">The code</h2>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-py" data-lang="py"><span class="line"><span class="ln"> 1</span><span class="cl"><span class="kn">import</span> <span class="nn">sentence_transformers</span>
</span></span><span class="line"><span class="ln"> 2</span><span class="cl"><span class="kn">import</span> <span class="nn">faiss</span>
</span></span><span class="line"><span class="ln"> 3</span><span class="cl"><span class="kn">import</span> <span class="nn">polars</span> <span class="k">as</span> <span class="nn">pl</span>
</span></span><span class="line"><span class="ln"> 4</span><span class="cl"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</span></span><span class="line"><span class="ln"> 5</span><span class="cl">
</span></span><span class="line"><span class="ln"> 6</span><span class="cl"><span class="c1"># Data is read here. You download the files from Kaggle here: </span>
</span></span><span class="line"><span class="ln"> 7</span><span class="cl"><span class="c1"># https://www.kaggle.com/datasets/lakritidis/product-classification-and-categorization</span>
</span></span><span class="line"><span class="ln"> 8</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&#34;archive/shopmania.csv&#34;</span><span class="p">,</span> <span class="n">new_columns</span><span class="o">=</span><span class="p">[</span>
</span></span><span class="line"><span class="ln"> 9</span><span class="cl"> <span class="s2">&#34;product_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">,</span> <span class="s2">&#34;category_ID&#34;</span><span class="p">,</span> <span class="s2">&#34;category_label&#34;</span><span class="p">])</span>
</span></span><span class="line"><span class="ln">10</span><span class="cl"><span class="n">data</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">11</span><span class="cl"> <span class="n">data</span>
</span></span><span class="line"><span class="ln">12</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="s2">&#34;category_ID&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">10000</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">13</span><span class="cl"> <span class="o">.</span><span class="n">with_row_count</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">14</span><span class="cl"><span class="p">)</span>
</span></span><span class="line"><span class="ln">15</span><span class="cl">
</span></span><span class="line"><span class="ln">16</span><span class="cl">
</span></span><span class="line"><span class="ln">17</span><span class="cl"><span class="c1"># See list of models at www.sbert.net/docs/pretrained_models.html</span>
</span></span><span class="line"><span class="ln">18</span><span class="cl"><span class="n">ST</span> <span class="o">=</span> <span class="n">sentence_transformers</span><span class="o">.</span><span class="n">SentenceTransformer</span><span class="p">(</span><span class="s2">&#34;all-mpnet-base-v2&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">19</span><span class="cl"><span class="n">title_embeddings</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">20</span><span class="cl"> <span class="n">ST</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">21</span><span class="cl"> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">to_list</span><span class="p">(),</span>
</span></span><span class="line"><span class="ln">22</span><span class="cl"> <span class="c1"># I&#39;m on a MacBook, you should use `cuda` or `cpu`</span>
</span></span><span class="line"><span class="ln">23</span><span class="cl"> <span class="c1"># if you&#39;ve got different hardware.</span>
</span></span><span class="line"><span class="ln">24</span><span class="cl"> <span class="n">device</span><span class="o">=</span><span class="s2">&#34;mps&#34;</span><span class="p">,</span>
</span></span><span class="line"><span class="ln">25</span><span class="cl"> <span class="n">show_progress_bar</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">convert_to_tensor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">26</span><span class="cl"> <span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
</span></span><span class="line"><span class="ln">27</span><span class="cl">
</span></span><span class="line"><span class="ln">28</span><span class="cl"><span class="c1"># Code to create a FAISS index</span>
</span></span><span class="line"><span class="ln">29</span><span class="cl"><span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">30</span><span class="cl"> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">31</span><span class="cl"> <span class="n">data</span> <span class="c1"># this needs to be an argument if you want to create a generic function</span>
</span></span><span class="line"><span class="ln">32</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span> <span class="o">==</span> <span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">33</span><span class="cl"> <span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">34</span><span class="cl"> <span class="o">.</span><span class="n">to_list</span><span class="p">()</span>
</span></span><span class="line"><span class="ln">35</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">36</span><span class="cl">
</span></span><span class="line"><span class="ln">37</span><span class="cl"> <span class="n">faiss_data</span> <span class="o">=</span> <span class="n">title_embeddings</span><span class="p">[</span><span class="n">faiss_indices</span><span class="p">]</span>
</span></span><span class="line"><span class="ln">38</span><span class="cl"> <span class="n">d</span> <span class="o">=</span> <span class="n">faiss_data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Number of dimensions</span>
</span></span><span class="line"><span class="ln">39</span><span class="cl"> <span class="n">faiss_DB</span> <span class="o">=</span> <span class="n">faiss</span><span class="o">.</span><span class="n">IndexFlatIP</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="c1"># Index using Inner Product</span>
</span></span><span class="line"><span class="ln">40</span><span class="cl"> <span class="n">faiss</span><span class="o">.</span><span class="n">normalize_L2</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Normalized L2 with Inner Product search = cosine similarity</span>
</span></span><span class="line"><span class="ln">41</span><span class="cl"> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">)</span> <span class="c1"># Build the index</span>
</span></span><span class="line"><span class="ln">42</span><span class="cl">
</span></span><span class="line"><span class="ln">43</span><span class="cl"> <span class="k">return</span> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span>
</span></span><span class="line"><span class="ln">44</span><span class="cl">
</span></span><span class="line"><span class="ln">45</span><span class="cl"><span class="c1"># Code to create an edge-list</span>
</span></span><span class="line"><span class="ln">46</span><span class="cl"><span class="k">def</span> <span class="nf">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">47</span><span class="cl"> <span class="n">faiss_DB</span><span class="p">,</span> <span class="n">faiss_data</span><span class="p">,</span> <span class="n">faiss_indices</span> <span class="o">=</span> <span class="n">create_index</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">48</span><span class="cl"> <span class="c1"># To map the data back to the original `train[b&#39;data&#39;]` array</span>
</span></span><span class="line"><span class="ln">49</span><span class="cl"> <span class="n">faiss_indices_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">faiss_indices</span><span class="p">)}</span>
</span></span><span class="line"><span class="ln">50</span><span class="cl"> <span class="c1"># To map the indices back to the original strings</span>
</span></span><span class="line"><span class="ln">51</span><span class="cl"> <span class="n">title_name_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="n">x</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&#34;row_idx&#34;</span><span class="p">,</span> <span class="s2">&#34;product_title&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">rows</span><span class="p">()}</span>
</span></span><span class="line"><span class="ln">52</span><span class="cl"> <span class="n">distances</span><span class="p">,</span> <span class="n">neighbors</span> <span class="o">=</span> <span class="n">faiss_DB</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">faiss_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">53</span><span class="cl">
</span></span><span class="line"><span class="ln">54</span><span class="cl"> <span class="k">return</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">55</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
</span></span><span class="line"><span class="ln">56</span><span class="cl"> <span class="s2">&#34;from&#34;</span><span class="p">:</span> <span class="n">faiss_indices</span><span class="p">})</span>
</span></span><span class="line"><span class="ln">57</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">58</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="n">neighbors</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">59</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">,</span> <span class="n">distances</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">60</span><span class="cl"> <span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">,</span> <span class="s2">&#34;distance&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">61</span><span class="cl"> <span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
</span></span><span class="line"><span class="ln">62</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">63</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">),</span>
</span></span><span class="line"><span class="ln">64</span><span class="cl"> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">65</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">faiss_indices_map</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">66</span><span class="cl"> <span class="o">.</span><span class="n">map_dict</span><span class="p">(</span><span class="n">title_name_map</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">67</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;from&#34;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;to&#34;</span><span class="p">))</span>
</span></span><span class="line"><span class="ln">68</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">69</span><span class="cl">
</span></span><span class="line"><span class="ln">70</span><span class="cl"><span class="c1"># Code to extract components from a Network Graph</span>
</span></span><span class="line"><span class="ln">71</span><span class="cl"><span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
</span></span><span class="line"><span class="ln">72</span><span class="cl"><span class="k">def</span> <span class="nf">get_cluster_map</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">min_cosine_distance</span><span class="o">=</span><span class="mf">0.95</span><span class="p">):</span>
</span></span><span class="line"><span class="ln">73</span><span class="cl"> <span class="n">edge_list</span> <span class="o">=</span> <span class="p">(</span>
</span></span><span class="line"><span class="ln">74</span><span class="cl"> <span class="n">get_edge_list</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">75</span><span class="cl"> <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">pl</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&#34;distance&#34;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">min_cosine_distance</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">76</span><span class="cl"> <span class="p">)</span>
</span></span><span class="line"><span class="ln">77</span><span class="cl"> <span class="n">graph</span> <span class="o">=</span> <span class="n">nx</span><span class="o">.</span><span class="n">from_pandas_edgelist</span><span class="p">(</span><span class="n">edge_list</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">(),</span> <span class="n">source</span><span class="o">=</span><span class="s2">&#34;from&#34;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&#34;to&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">78</span><span class="cl"> <span class="k">return</span> <span class="p">{</span><span class="n">i</span><span class="p">:</span> <span class="nb">list</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span><span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">nx</span><span class="o">.</span><span class="n">connected_components</span><span class="p">(</span><span class="n">graph</span><span class="p">))}</span>
</span></span><span class="line"><span class="ln">79</span><span class="cl">
</span></span><span class="line"><span class="ln">80</span><span class="cl"><span class="c1"># Example call to a single category to obtain its clusters</span>
</span></span><span class="line"><span class="ln">81</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="n">get_cluster_map</span><span class="p">(</span><span class="s2">&#34;Cell Phones Accessories&#34;</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span>
</span></span><span class="line"><span class="ln">82</span><span class="cl"><span class="c1"># Example call to **all** categories to obtain all clusters</span>
</span></span><span class="line"><span class="ln">83</span><span class="cl"><span class="n">clusters</span> <span class="o">=</span> <span class="p">[</span><span class="n">get_cluster_map</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mf">0.95</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="o">.</span><span class="n">get_column</span><span class="p">(</span><span class="s2">&#34;category_label&#34;</span><span class="p">)</span><span class="o">.</span><span class="n">unique</span><span class="p">()]</span></span></span></code></pre></div><h2 id="how-the-code-works">How the code works</h2>
<p>If you want to write down an algorithmic way of looking at this approach,</p>
<ol>
<li>Obtain a 2D representation of the labelled/categorized data. This can be embeddings for strings, the final hidden state output from a generic CNN model for images, or a good ol&rsquo; tabular dataset where all numbers are normalized and can be expressed as such.</li>
<li>Create an ANN database (based on a package such as <code>faiss</code>) that allows fast nearest neighbor searches. Use cosine similarity for an easy threshold determination step.</li>
<li>Obtain an edge-list of the k (from 5 to 100) nearest neighbors for <strong>all</strong> data points in the ANN database (or for a sample of them, in case your dataset is incredibly HUGE).</li>
<li>Apply a minimum threshold on similarity (completely based on heuristics), and obtain the connected components of the network graph from the filtered edge-list you just created.</li>
<li>Map all indices back to their source data-points that make sense, and pick any number of items from each cluster (usually, I end up picking one element from each cluster), and you now have your representative sample!</li>
</ol>
]]></content:encoded></item></channel></rss>

View File

@@ -0,0 +1,6 @@
<!doctype html><html lang=en-US><head><meta http-equiv=X-Clacks-Overhead content="GNU Terry Pratchett"><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>Samples | Avinash's Blog</title><meta name=title content="Samples"><meta name=description content><meta name=author content><meta name=keywords content="approximate,category,faiss,graph,nearest,neighbor,network,networkx,polars,powerpoint,ppt,representative,samples,vba,"><meta property="og:url" content="https://avimallu.dev/tags/samples/"><meta property="og:site_name" content="Avinash's Blog"><meta property="og:title" content="Samples"><meta property="og:locale" content="en_US"><meta property="og:type" content="website"><meta property="og:image" content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://avimallu.dev/static/favicon.ico"><meta name=twitter:title content="Samples"><meta itemprop=name content="Samples"><meta itemprop=datePublished content="2023-10-19T00:00:00+00:00"><meta itemprop=dateModified content="2023-10-19T00:00:00+00:00"><meta itemprop=image content="https://avimallu.dev/static/favicon.ico"><meta name=referrer content="no-referrer-when-downgrade"><link href=/original.min.css rel=stylesheet><link rel=alternate type=application/rss+xml href=https://avimallu.dev/tags/samples/index.xml title="Avinash's Blog"></head><body><header><a class=skip-link href=#main-content>Skip to main content</a>
<a href=/ class=title><h1>Avinash's Blog</h1></a><nav><a href=/>about</a>
<a href=/blog/>blog</a>
<a href=/projects/>projects</a>
<a href=https://avimallu.dev/index.xml>rss</a></nav></header><main id=main-content><content><h3 class=blog-filter>Filtering for "Samples"</h3><ul class=blog-posts><li><span><i><time datetime=2023-10-19 pubdate>2023-10-19
</time></i></span><a href=/blog/002_representative_samples/>Finding representative samples efficiently for large datasets</a></li></ul></content></main><footer><small>© Avinash Mallya | Design via <a href=https://github.com/clente/hugo-bearcub>Bear Cub</a>.</small></footer></body></html>

Some files were not shown because too many files have changed in this diff Show More