From bb9a0ed907417b2543eeedc592c678a8831d1c6f Mon Sep 17 00:00:00 2001 From: Jason Swank Date: Fri, 19 Jul 2024 21:55:07 -0400 Subject: [PATCH] initial commit --- db.sql | 63 ++++++++++++++++ dump-sql.py | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++ notes.md | 20 +++++ 3 files changed, 289 insertions(+) create mode 100644 db.sql create mode 100755 dump-sql.py create mode 100644 notes.md diff --git a/db.sql b/db.sql new file mode 100644 index 0000000..0bd353b --- /dev/null +++ b/db.sql @@ -0,0 +1,63 @@ +CREATE TABLE events ( + id uuid DEFAULT uuid_generate_v4() NOT NULL, + title character varying(255) NOT NULL, + slug character varying(255) NOT NULL, + multi_day boolean, + image character varying(255), + social_image character varying(255), + venue_id uuid, + admission_fee character varying(255), + organizer_contact character varying(255), + brief_description text, + description text, + links character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[], + website_link text, + ticket_link text, + fb_event_link character varying(255), + eventbrite_link character varying(255), + bitly_link character varying(255), + tags character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[], + verified boolean DEFAULT false NOT NULL, + "createdAt" timestamp with time zone DEFAULT now() NOT NULL, + "updatedAt" timestamp with time zone DEFAULT now() NOT NULL, + reviewed_by_org character varying, + accessibility character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[], + category character varying(255), + condition character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[], + mode character varying(255) +); + + +CREATE TABLE datetime_venue ( + id uuid NOT NULL, + event_id uuid NOT NULL, + venue_id uuid, + start_time timestamp with time zone NOT NULL, + end_time timestamp with time zone NOT NULL, + optional_title character varying(255), + "createdAt" 
timestamp with time zone DEFAULT now() NOT NULL, + "updatedAt" timestamp with time zone DEFAULT now() NOT NULL, + timezone character varying(255) DEFAULT 'US/Eastern'::character varying NOT NULL, + category character varying(255) +); + + +CREATE TABLE venues ( + id uuid DEFAULT uuid_generate_v4() NOT NULL, + name character varying(255), + slug character varying(255), + address character varying(255), + g_map_link character varying(255), + "createdAt" timestamp with time zone DEFAULT now() NOT NULL, + "updatedAt" timestamp with time zone DEFAULT now() NOT NULL, + is_soft_deleted boolean DEFAULT false NOT NULL, + gps_lat double precision, + gps_long double precision, + gps_alt double precision, + street character varying(255) DEFAULT NULL::character varying, + city character varying(255) DEFAULT NULL::character varying, + state character varying(255) DEFAULT NULL::character varying, + zip character varying(255) DEFAULT NULL::character varying, + neighborhood character varying(255) DEFAULT NULL::character varying +); + diff --git a/dump-sql.py b/dump-sql.py new file mode 100755 index 0000000..498d0e2 --- /dev/null +++ b/dump-sql.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 + +# Run a Postgres SQL statement which combines information from three tables into +# a single set of results. 
+
+import os
+
+import pandas as pd
+from sqlalchemy import create_engine
+from sqlalchemy.engine import URL
+
+pguser = os.getenv('PGUSER', 'postgres')
+pgpassword = os.getenv('PGPASSWORD', 'xxx')
+pghost = os.getenv('PGHOST', 'db')
+pgport = os.getenv('PGPORT', '5432')
+pgdatabase = os.getenv('PGDATABASE', 'infinite-api')
+
+# Connect to the Postgres database. URL.create() escapes each component, so a
+# password containing characters such as '@', ':' or '/' cannot corrupt the URL
+# the way plain string interpolation would.
+db_url = URL.create("postgresql+psycopg2", username=pguser, password=pgpassword,
+                    host=pghost, port=int(pgport), database=pgdatabase)
+engine = create_engine(db_url)
+
+# One row per datetime_venue entry since 2022-01-01 that has both an event and a
+# venue (inner joins). Every column is aliased with its table name so that the
+# exported column names are unique.
+sql = """
+    SELECT
+        datetime_venue.id as datetime_venue_id,
+        datetime_venue.event_id as datetime_venue_event_id,
+        datetime_venue.venue_id as datetime_venue_venue_id,
+        datetime_venue.start_time as datetime_venue_start_time,
+        datetime_venue.end_time as datetime_venue_end_time,
+        datetime_venue.optional_title as datetime_venue_optional_title,
+        datetime_venue."createdAt" as datetime_venue_createdAt,
+        datetime_venue."updatedAt" as datetime_venue_updatedAt,
+        datetime_venue.timezone as datetime_venue_timezone,
+        datetime_venue.category as datetime_venue_category,
+        events.id as events_id,
+        events.title as events_title,
+        events.slug as events_slug,
+        events.multi_day as events_multi_day,
+        events.image as events_image,
+        events.social_image as events_social_image,
+        events.venue_id as events_venue_id,
+        events.admission_fee as events_admission_fee,
+        events.organizer_contact as events_organizer_contact,
+        events.brief_description as events_brief_description,
+        events.description as events_description,
+        events.links as events_links,
+        events.website_link as events_website_link,
+        events.ticket_link as events_ticket_link,
+        events.fb_event_link as events_fb_event_link,
+        events.eventbrite_link as events_eventbrite_link,
+        events.bitly_link as events_bitly_link,
+        events.tags as events_tags,
+        events.verified as events_verified,
+        events."createdAt" as events_createdAt,
+        events."updatedAt" as events_updatedAt,
+        events.reviewed_by_org as events_reviewed_by_org,
+        events.accessibility as events_accessibility,
+        events.category as events_category,
+        events.condition as events_condition,
+        events.mode as events_mode,
+        venues.id as venues_id,
+        venues.name as venues_name,
+        venues.slug as venues_slug,
+        venues.address as venues_address,
+        venues.g_map_link as venues_g_map_link,
+        venues."createdAt" as venues_createdAt,
+        venues."updatedAt" as venues_updatedAt,
+        venues.is_soft_deleted as venues_is_soft_deleted,
+        venues.gps_lat as venues_gps_lat,
+        venues.gps_long as venues_gps_long,
+        venues.gps_alt as venues_gps_alt,
+        venues.street as venues_street,
+        venues.city as venues_city,
+        venues.state as venues_state,
+        venues.zip as venues_zip,
+        venues.neighborhood as venues_neighborhood
+    FROM
+        datetime_venue
+    JOIN
+        events
+        ON datetime_venue.event_id = events.id
+    JOIN
+        venues
+        ON datetime_venue.venue_id = venues.id
+    WHERE
+        datetime_venue.start_time >= '2022-01-01'
+    ORDER BY datetime_venue_start_time DESC;
+"""
+
+df = pd.read_sql(sql, engine)
+
+# list all the columns in the dataframe
+print(df.columns)
+
+# Flag every event that has more than one row in the result as multi-day.
+# Events with a single row keep whatever value the database had.
+# NOTE(review): several rows could also be several sessions on one day, and
+# occurrences before 2022 are filtered out above - confirm this definition.
+has_several_rows = df['datetime_venue_event_id'].duplicated(keep=False)
+df.loc[has_several_rows, 'events_multi_day'] = True
+
+# Drop duplicate columns like venue_id, event_id, etc.
+# These columns are not needed since we have the unique identifier 'datetime_venue_id' for each row.
+# PostgreSQL folds the unquoted aliases to lower case, hence the lower-case names.
+df = df.drop(columns=['datetime_venue_event_id', 'datetime_venue_venue_id', 'events_venue_id', 'datetime_venue_updatedat', 'events_updatedat', 'venues_updatedat', 'datetime_venue_createdat', 'events_createdat', 'venues_createdat'])
+
+# Print the first few rows of the dataframe
+print(df.head())
+
+# Save the dataframe to a CSV file
+df.to_csv('events.csv', index=False, encoding='utf-8')
+
+# SQL statement to combine information from three tables:
+# datetime_venue has a row for each event date and venue
+# events has a row containing metadata for each event
+# venues has a row containing metadata for each venue
+#
+# an example SQL query is:
+#
+# SELECT datetime_venue.start_time, datetime_venue.end_time, events.title, venues.name FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id;
+#
+# The query we run returns all columns from the three tables.
+#
+# SELECT * FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id;
+#
+# We only care about events since the start of 2022.
+#
+# SELECT * FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id WHERE datetime_venue.start_time >= '2022-01-01';
+#
+# Since this data will be imported elsewhere, we want to be sure that each column is named uniquely.
+# We will prefix each column with the table name.
+# +# SELECT datetime_venue.id as datetime_venue_id,datetime_venue.event_id as datetime_venue_event_id, datetime_venue.venue_id as datetime_venue_venue_id, datetime_venue.start_time as datetime_venue_start_time, datetime_venue.end_time as datetime_venue_end_time, datetime_venue.optional_title as datetime_venue_optional_title, datetime_venue."createdAt" as datetime_venue_createdAt, datetime_venue."updatedAt" as datetime_venue_updatedAt, datetime_venue.timezone as datetime_venue_timezone, datetime_venue.category as datetime_venue_category, events.id as events_id, events.title as events_title, events.slug as events_slug, events.multi_day as events_multi_day, events.image as events_image, events.social_image as events_social_image, events.venue_id as events_venue_id, events.admission_fee as events_admission_fee, events.organizer_contact as events_organizer_contact, events.brief_description as events_brief_description, events.description as events_description, events.links as events_links, events.website_link as events_website_link, events.ticket_link as events_ticket_link, events.fb_event_link as events_fb_event_link, events.eventbrite_link as events_eventbrite_link, events.bitly_link as events_bitly_link, events.tags as events_tags, events.verified as events_verified, events."createdAt" as events_createdAt, events."updatedAt" as events_updatedAt, events.reviewed_by_org as events_reviewed_by_org, events.accessibility as events_accessibility, events.category as events_category, events.condition as events_condition, events.mode as events_mode, venues.id as venues_id, venues.name as venues_name, venues.slug as venues_slug, venues.address as venues_address, venues.g_map_link as venues_g_map_link, venues."createdAt" as venues_createdAt, venues."updatedAt" as venues_updatedAt, venues.is_soft_deleted as venues_is_soft_deleted, venues.gps_lat as venues_gps_lat, venues.gps_long as venues_gps_long, venues.gps_alt as venues_gps_alt, venues.street as venues_street, venues.city as 
venues_city, venues.state as venues_state, venues.zip as venues_zip, venues.neighborhood as venues_neighborhood FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id WHERE datetime_venue.start_time >= '2022-01-01'; +# +""" +-- DB table info + +CREATE TABLE datetime_venue ( + id uuid NOT NULL, + event_id uuid NOT NULL, + venue_id uuid, + start_time timestamp with time zone NOT NULL, + end_time timestamp with time zone NOT NULL, + optional_title character varying(255), + "createdAt" timestamp with time zone DEFAULT now() NOT NULL, + "updatedAt" timestamp with time zone DEFAULT now() NOT NULL, + timezone character varying(255) DEFAULT 'US/Eastern'::character varying NOT NULL, + category character varying(255) +); + + +CREATE TABLE events ( + id uuid DEFAULT uuid_generate_v4() NOT NULL, + title character varying(255) NOT NULL, + slug character varying(255) NOT NULL, + multi_day boolean, + image character varying(255), + social_image character varying(255), + venue_id uuid, + admission_fee character varying(255), + organizer_contact character varying(255), + brief_description text, + description text, + links character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[], + website_link text, + ticket_link text, + fb_event_link character varying(255), + eventbrite_link character varying(255), + bitly_link character varying(255), + tags character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[], + verified boolean DEFAULT false NOT NULL, + "createdAt" timestamp with time zone DEFAULT now() NOT NULL, + "updatedAt" timestamp with time zone DEFAULT now() NOT NULL, + reviewed_by_org character varying, + accessibility character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[], + category character varying(255), + condition character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[], + mode 
character varying(255) +); + + + +CREATE TABLE venues ( + id uuid DEFAULT uuid_generate_v4() NOT NULL, + name character varying(255), + slug character varying(255), + address character varying(255), + g_map_link character varying(255), + "createdAt" timestamp with time zone DEFAULT now() NOT NULL, + "updatedAt" timestamp with time zone DEFAULT now() NOT NULL, + is_soft_deleted boolean DEFAULT false NOT NULL, + gps_lat double precision, + gps_long double precision, + gps_alt double precision, + street character varying(255) DEFAULT NULL::character varying, + city character varying(255) DEFAULT NULL::character varying, + state character varying(255) DEFAULT NULL::character varying, + zip character varying(255) DEFAULT NULL::character varying, + neighborhood character varying(255) DEFAULT NULL::character varying +); +""" diff --git a/notes.md b/notes.md new file mode 100644 index 0000000..2db48dc --- /dev/null +++ b/notes.md @@ -0,0 +1,20 @@ +## Notes + +### 2024-07-19 + +The dataframe column names are lower case even though the SQL query aliases use +mixed case, because PostgreSQL folds unquoted identifiers to lower case (like +`venues_updatedAt` vs `venues_updatedat`). + +**Which column is events_multi_day?** +``` +cat events.old.csv | head -n 1 | awk -F',' '{for(i=1; i<=NF; i++) if($i == "events_multi_day") print i}' +``` + +**Count multi-day events.** +``` +$ zcat events.csv.gz |awk -F',' '$10 == "True"' | wc -l +3779 +$ zcat events.csv.gz |awk -F',' '$10 == "False"' | wc -l +2559 +```