initial commit

This commit is contained in:
Jason Swank 2024-07-19 21:55:07 -04:00
commit bb9a0ed907
3 changed files with 289 additions and 0 deletions

63
db.sql Normal file
View File

@ -0,0 +1,63 @@
-- events: one row of metadata per event.
-- The id default, uuid_generate_v4(), is supplied by the "uuid-ossp" extension.
-- Type names use PostgreSQL's short aliases: varchar = character varying,
-- timestamptz = timestamp with time zone.
-- NOTE(review): no PRIMARY KEY or FOREIGN KEY is declared here; confirm whether
-- those constraints are added elsewhere.
CREATE TABLE events (
    id                 uuid            NOT NULL DEFAULT uuid_generate_v4(),
    title              varchar(255)    NOT NULL,
    slug               varchar(255)    NOT NULL,
    multi_day          boolean,
    image              varchar(255),
    social_image       varchar(255),
    venue_id           uuid,
    admission_fee      varchar(255),
    organizer_contact  varchar(255),
    brief_description  text,
    description        text,
    -- The array columns default to an empty array rather than NULL.
    links              varchar(255)[]  DEFAULT (ARRAY[]::varchar[])::varchar(255)[],
    website_link       text,
    ticket_link        text,
    fb_event_link      varchar(255),
    eventbrite_link    varchar(255),
    bitly_link         varchar(255),
    tags               varchar(255)[]  DEFAULT (ARRAY[]::varchar[])::varchar(255)[],
    verified           boolean         NOT NULL DEFAULT false,
    -- Mixed-case column names must stay double-quoted to keep their case.
    "createdAt"        timestamptz     NOT NULL DEFAULT now(),
    "updatedAt"        timestamptz     NOT NULL DEFAULT now(),
    reviewed_by_org    varchar,
    accessibility      varchar(255)[]  DEFAULT (ARRAY[]::varchar[])::varchar(255)[],
    category           varchar(255),
    condition          varchar(255)[]  DEFAULT (ARRAY[]::varchar[])::varchar(255)[],
    mode               varchar(255)
);
-- datetime_venue: one row per event date and venue.
-- Unlike events and venues, id has no DEFAULT here, so the writer must supply it.
-- Type names use PostgreSQL's short aliases: varchar = character varying,
-- timestamptz = timestamp with time zone.
-- NOTE(review): event_id / venue_id look like references to events.id and
-- venues.id, but no FOREIGN KEY is declared here; confirm.
CREATE TABLE datetime_venue (
    id              uuid          NOT NULL,
    event_id        uuid          NOT NULL,
    venue_id        uuid,
    start_time      timestamptz   NOT NULL,
    end_time        timestamptz   NOT NULL,
    optional_title  varchar(255),
    -- Mixed-case column names must stay double-quoted to keep their case.
    "createdAt"     timestamptz   NOT NULL DEFAULT now(),
    "updatedAt"     timestamptz   NOT NULL DEFAULT now(),
    timezone        varchar(255)  NOT NULL DEFAULT 'US/Eastern'::varchar,
    category        varchar(255)
);
-- venues: one row of metadata per venue.
-- The id default, uuid_generate_v4(), is supplied by the "uuid-ossp" extension.
-- Type names use PostgreSQL's short aliases: varchar = character varying,
-- timestamptz = timestamp with time zone.
-- NOTE(review): no PRIMARY KEY is declared here; confirm whether it is added
-- elsewhere.
CREATE TABLE venues (
    id               uuid              NOT NULL DEFAULT uuid_generate_v4(),
    name             varchar(255),
    slug             varchar(255),
    address          varchar(255),
    g_map_link       varchar(255),
    -- Mixed-case column names must stay double-quoted to keep their case.
    "createdAt"      timestamptz       NOT NULL DEFAULT now(),
    "updatedAt"      timestamptz       NOT NULL DEFAULT now(),
    is_soft_deleted  boolean           NOT NULL DEFAULT false,
    gps_lat          double precision,
    gps_long         double precision,
    gps_alt          double precision,
    -- The explicit NULL defaults below are kept as written in the original DDL.
    street           varchar(255)      DEFAULT NULL::varchar,
    city             varchar(255)      DEFAULT NULL::varchar,
    state            varchar(255)      DEFAULT NULL::varchar,
    zip              varchar(255)      DEFAULT NULL::varchar,
    neighborhood     varchar(255)      DEFAULT NULL::varchar
);

206
dump-sql.py Executable file
View File

@ -0,0 +1,206 @@
#!/usr/bin/env python3
# Run a PostgreSQL statement which combines information from three tables into
# a single set of results.
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine
# Connection settings are read from the PG* environment variables; the second
# argument of each getenv call is the fallback used when the variable is unset.
pguser = os.getenv('PGUSER', 'postgres')
pgpassword = os.getenv('PGPASSWORD', 'xxx')
pghost = os.getenv('PGHOST', 'db')
pgport = os.getenv('PGPORT', '5432')
pgdatabase = os.getenv('PGDATABASE', 'infinite-api')
# Build the engine for the Postgres database.
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' inside the credentials cannot be mistaken for URL delimiters.
# safe='' makes quote() encode '/' as well, which it leaves alone by default.
engine = create_engine(
    "postgresql+psycopg2://"
    f"{quote(pguser, safe='')}:{quote(pgpassword, safe='')}"
    f"@{pghost}:{pgport}/{pgdatabase}"
)
# The SQL statement that combines information from the three tables.
#
# - Every output column is prefixed with its table name so that names stay
#   unique after the join.
# - Aliases are written in lower case on purpose: PostgreSQL folds unquoted
#   identifiers to lower case, so an alias such as events_createdAt would reach
#   the dataframe as events_createdat anyway. Spelling them in lower case keeps
#   the query in line with the column names used later in this script.
# - Both joins are inner joins, so a datetime_venue row without a matching
#   venue (venue_id is nullable) is not returned.
# - The bare '2022-01-01' literal carries no time zone; PostgreSQL interprets
#   it using the session's TimeZone setting.
# - datetime_venue_id is added as an ORDER BY tie-breaker so that rows sharing
#   a start time come back in a reproducible order between runs.
sql = """
SELECT
    dv.id               AS datetime_venue_id,
    dv.event_id         AS datetime_venue_event_id,
    dv.venue_id         AS datetime_venue_venue_id,
    dv.start_time       AS datetime_venue_start_time,
    dv.end_time         AS datetime_venue_end_time,
    dv.optional_title   AS datetime_venue_optional_title,
    dv."createdAt"      AS datetime_venue_createdat,
    dv."updatedAt"      AS datetime_venue_updatedat,
    dv.timezone         AS datetime_venue_timezone,
    dv.category         AS datetime_venue_category,
    e.id                AS events_id,
    e.title             AS events_title,
    e.slug              AS events_slug,
    e.multi_day         AS events_multi_day,
    e.image             AS events_image,
    e.social_image      AS events_social_image,
    e.venue_id          AS events_venue_id,
    e.admission_fee     AS events_admission_fee,
    e.organizer_contact AS events_organizer_contact,
    e.brief_description AS events_brief_description,
    e.description       AS events_description,
    e.links             AS events_links,
    e.website_link      AS events_website_link,
    e.ticket_link       AS events_ticket_link,
    e.fb_event_link     AS events_fb_event_link,
    e.eventbrite_link   AS events_eventbrite_link,
    e.bitly_link        AS events_bitly_link,
    e.tags              AS events_tags,
    e.verified          AS events_verified,
    e."createdAt"       AS events_createdat,
    e."updatedAt"       AS events_updatedat,
    e.reviewed_by_org   AS events_reviewed_by_org,
    e.accessibility     AS events_accessibility,
    e.category          AS events_category,
    e.condition         AS events_condition,
    e.mode              AS events_mode,
    v.id                AS venues_id,
    v.name              AS venues_name,
    v.slug              AS venues_slug,
    v.address           AS venues_address,
    v.g_map_link        AS venues_g_map_link,
    v."createdAt"       AS venues_createdat,
    v."updatedAt"       AS venues_updatedat,
    v.is_soft_deleted   AS venues_is_soft_deleted,
    v.gps_lat           AS venues_gps_lat,
    v.gps_long          AS venues_gps_long,
    v.gps_alt           AS venues_gps_alt,
    v.street            AS venues_street,
    v.city              AS venues_city,
    v.state             AS venues_state,
    v.zip               AS venues_zip,
    v.neighborhood      AS venues_neighborhood
FROM datetime_venue AS dv
INNER JOIN events AS e
    ON dv.event_id = e.id
INNER JOIN venues AS v
    ON dv.venue_id = v.id
WHERE
    dv.start_time >= '2022-01-01'
ORDER BY
    datetime_venue_start_time DESC,
    datetime_venue_id;
"""
# Run the query and load the complete result set into a dataframe.
df = pd.read_sql(sql, engine)
# list all the columns in the dataframe
print(df.columns)
#
# Update the 'events_multi_day' column to indicate whether an event spans multiple days
#
# An event is flagged when more than one row of the result set carries its
# 'datetime_venue_event_id'. duplicated(keep=False) marks every such row in a
# single pass, instead of rebuilding a full boolean mask once per event.
# Rows of events that appear only once keep whatever value the database returned.
# Only rows returned by the query are counted, so dates excluded by its WHERE
# clause or joins do not contribute.
has_multiple_rows = df['datetime_venue_event_id'].duplicated(keep=False)
df.loc[has_multiple_rows, 'events_multi_day'] = True
# Drop duplicate columns like venue_id, event_id, etc.
# These columns are not needed since we have the unique identifier 'datetime_venue_id' for each row.
# The createdAt/updatedAt names are lower case because that is how they appear
# in the dataframe (PostgreSQL folds unquoted aliases to lower case).
df = df.drop(columns=[
    'datetime_venue_event_id',
    'datetime_venue_venue_id',
    'events_venue_id',
    'datetime_venue_updatedat',
    'events_updatedat',
    'venues_updatedat',
    'datetime_venue_createdat',
    'events_createdat',
    'venues_createdat',
])
# Print the first few rows of the dataframe
print(df.head())
# Save the dataframe to a CSV file
df.to_csv('events.csv', index=False, encoding='utf-8')
# SQL statement to combine information from three tables:
# datetime_venue has a row for each event date and venue
# events has a row containing metadata for each event
# venues has a row containing metadata for each venue
#
# an example SQL query is:
#
# SELECT datetime_venue.start_time, datetime_venue.end_time, events.title, venues.name FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id;
#
# The query we will run includes all columns from the three tables.
#
# SELECT * FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id;
#
# We only care about events since the start of 2022.
#
# SELECT * FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id WHERE datetime_venue.start_time >= '2022-01-01';
#
# Since this data will be imported elsewhere, we want to be sure that each column is named uniquely.
# We will prefix each column with the table name.
#
# SELECT datetime_venue.id as datetime_venue_id,datetime_venue.event_id as datetime_venue_event_id, datetime_venue.venue_id as datetime_venue_venue_id, datetime_venue.start_time as datetime_venue_start_time, datetime_venue.end_time as datetime_venue_end_time, datetime_venue.optional_title as datetime_venue_optional_title, datetime_venue."createdAt" as datetime_venue_createdAt, datetime_venue."updatedAt" as datetime_venue_updatedAt, datetime_venue.timezone as datetime_venue_timezone, datetime_venue.category as datetime_venue_category, events.id as events_id, events.title as events_title, events.slug as events_slug, events.multi_day as events_multi_day, events.image as events_image, events.social_image as events_social_image, events.venue_id as events_venue_id, events.admission_fee as events_admission_fee, events.organizer_contact as events_organizer_contact, events.brief_description as events_brief_description, events.description as events_description, events.links as events_links, events.website_link as events_website_link, events.ticket_link as events_ticket_link, events.fb_event_link as events_fb_event_link, events.eventbrite_link as events_eventbrite_link, events.bitly_link as events_bitly_link, events.tags as events_tags, events.verified as events_verified, events."createdAt" as events_createdAt, events."updatedAt" as events_updatedAt, events.reviewed_by_org as events_reviewed_by_org, events.accessibility as events_accessibility, events.category as events_category, events.condition as events_condition, events.mode as events_mode, venues.id as venues_id, venues.name as venues_name, venues.slug as venues_slug, venues.address as venues_address, venues.g_map_link as venues_g_map_link, venues."createdAt" as venues_createdAt, venues."updatedAt" as venues_updatedAt, venues.is_soft_deleted as venues_is_soft_deleted, venues.gps_lat as venues_gps_lat, venues.gps_long as venues_gps_long, venues.gps_alt as venues_gps_alt, venues.street as venues_street, venues.city as 
# venues_city, venues.state as venues_state, venues.zip as venues_zip, venues.neighborhood as venues_neighborhood FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id WHERE datetime_venue.start_time >= '2022-01-01';
#
# Reference only: the string literal below is never assigned or used, so it has
# no effect at runtime. It holds the CREATE TABLE statements for the three
# tables (datetime_venue, events, venues) that the query in this script joins.
"""
-- DB table info
CREATE TABLE datetime_venue (
id uuid NOT NULL,
event_id uuid NOT NULL,
venue_id uuid,
start_time timestamp with time zone NOT NULL,
end_time timestamp with time zone NOT NULL,
optional_title character varying(255),
"createdAt" timestamp with time zone DEFAULT now() NOT NULL,
"updatedAt" timestamp with time zone DEFAULT now() NOT NULL,
timezone character varying(255) DEFAULT 'US/Eastern'::character varying NOT NULL,
category character varying(255)
);
CREATE TABLE events (
id uuid DEFAULT uuid_generate_v4() NOT NULL,
title character varying(255) NOT NULL,
slug character varying(255) NOT NULL,
multi_day boolean,
image character varying(255),
social_image character varying(255),
venue_id uuid,
admission_fee character varying(255),
organizer_contact character varying(255),
brief_description text,
description text,
links character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[],
website_link text,
ticket_link text,
fb_event_link character varying(255),
eventbrite_link character varying(255),
bitly_link character varying(255),
tags character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[],
verified boolean DEFAULT false NOT NULL,
"createdAt" timestamp with time zone DEFAULT now() NOT NULL,
"updatedAt" timestamp with time zone DEFAULT now() NOT NULL,
reviewed_by_org character varying,
accessibility character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[],
category character varying(255),
condition character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[],
mode character varying(255)
);
CREATE TABLE venues (
id uuid DEFAULT uuid_generate_v4() NOT NULL,
name character varying(255),
slug character varying(255),
address character varying(255),
g_map_link character varying(255),
"createdAt" timestamp with time zone DEFAULT now() NOT NULL,
"updatedAt" timestamp with time zone DEFAULT now() NOT NULL,
is_soft_deleted boolean DEFAULT false NOT NULL,
gps_lat double precision,
gps_long double precision,
gps_alt double precision,
street character varying(255) DEFAULT NULL::character varying,
city character varying(255) DEFAULT NULL::character varying,
state character varying(255) DEFAULT NULL::character varying,
zip character varying(255) DEFAULT NULL::character varying,
neighborhood character varying(255) DEFAULT NULL::character varying
);
"""

20
notes.md Normal file
View File

@ -0,0 +1,20 @@
## Notes
### 2024-07-19
PostgreSQL folds unquoted identifiers, including column aliases, to lower case.
The column names in the dataframe are therefore lower case even though the SQL
query aliases use mixed case (e.g. `venues_updatedAt` becomes `venues_updatedat`).
**Which column is events_multi_day?**
```
cat events.old.csv | head -n 1 | awk -F',' '{for(i=1; i<=NF; i++) if($i == "events_multi_day") print i}'
```
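**Count multi-day events.** (`events_multi_day` is column 10. Splitting on `,` is only approximate: a quoted field that contains a comma shifts the column positions for that row.)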
```
$ zcat events.csv.gz |awk -F',' '$10 == "True"' | wc -l
3779
$ zcat events.csv.gz |awk -F',' '$10 == "False"' | wc -l
2559
```