207 lines
10 KiB
Python
Executable File
207 lines
10 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
# Run a Postgres SQL statement which combines information from three tables into
|
|
# a single set of results.
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine
|
|
|
|
# Connection settings are read from the PG* environment variables; the
# literal second argument of each call is used when the variable is unset.
pguser = os.getenv('PGUSER', 'postgres')
pgpassword = os.getenv('PGPASSWORD', 'xxx')
pghost = os.getenv('PGHOST', 'db')
pgport = os.getenv('PGPORT', '5432')
pgdatabase = os.getenv('PGDATABASE', 'infinite-api')

# Connect to the Postgres database.
# The user name and password are percent-encoded before being placed in the
# URL: characters such as '@', ':' or '/' inside a credential would otherwise
# be read as URL delimiters and the URL would be parsed incorrectly.
# quote(..., safe='') is used rather than quote_plus so that a space becomes
# '%20' (not '+'), which decodes correctly whether or not the URL parser
# treats '+' as a space.
engine = create_engine(
    "postgresql+psycopg2://"
    f"{quote(pguser, safe='')}:{quote(pgpassword, safe='')}"
    f"@{pghost}:{pgport}/{pgdatabase}"
)
|
|
|
|
# The SQL statement that combines information from three tables.
#
# Every selected column is aliased as <table>_<column> so that names stay
# unique across the three tables once they are flattened into one result set.
# The aliases are not double-quoted, so PostgreSQL folds them to lower case:
# e.g. `datetime_venue_createdAt` comes back as `datetime_venue_createdat`.
#
# NOTE(review): both joins are inner joins, so a datetime_venue row whose
# venue_id is NULL (the schema allows it) is left out of the result.
# Confirm that this is intended.
sql = """
SELECT
    datetime_venue.id as datetime_venue_id,
    datetime_venue.event_id as datetime_venue_event_id,
    datetime_venue.venue_id as datetime_venue_venue_id,
    datetime_venue.start_time as datetime_venue_start_time,
    datetime_venue.end_time as datetime_venue_end_time,
    datetime_venue.optional_title as datetime_venue_optional_title,
    datetime_venue."createdAt" as datetime_venue_createdAt,
    datetime_venue."updatedAt" as datetime_venue_updatedAt,
    datetime_venue.timezone as datetime_venue_timezone,
    datetime_venue.category as datetime_venue_category,
    events.id as events_id,
    events.title as events_title,
    events.slug as events_slug,
    events.multi_day as events_multi_day,
    events.image as events_image,
    events.social_image as events_social_image,
    events.venue_id as events_venue_id,
    events.admission_fee as events_admission_fee,
    events.organizer_contact as events_organizer_contact,
    events.brief_description as events_brief_description,
    events.description as events_description,
    events.links as events_links,
    events.website_link as events_website_link,
    events.ticket_link as events_ticket_link,
    events.fb_event_link as events_fb_event_link,
    events.eventbrite_link as events_eventbrite_link,
    events.bitly_link as events_bitly_link,
    events.tags as events_tags,
    events.verified as events_verified,
    events."createdAt" as events_createdAt,
    events."updatedAt" as events_updatedAt,
    events.reviewed_by_org as events_reviewed_by_org,
    events.accessibility as events_accessibility,
    events.category as events_category,
    events.condition as events_condition,
    events.mode as events_mode,
    venues.id as venues_id,
    venues.name as venues_name,
    venues.slug as venues_slug,
    venues.address as venues_address,
    venues.g_map_link as venues_g_map_link,
    venues."createdAt" as venues_createdAt,
    venues."updatedAt" as venues_updatedAt,
    venues.is_soft_deleted as venues_is_soft_deleted,
    venues.gps_lat as venues_gps_lat,
    venues.gps_long as venues_gps_long,
    venues.gps_alt as venues_gps_alt,
    venues.street as venues_street,
    venues.city as venues_city,
    venues.state as venues_state,
    venues.zip as venues_zip,
    venues.neighborhood as venues_neighborhood
FROM
    datetime_venue
JOIN
    events
    ON datetime_venue.event_id = events.id
JOIN
    venues
    ON datetime_venue.venue_id = venues.id
WHERE
    datetime_venue.start_time >= '2022-01-01'
ORDER BY datetime_venue_start_time DESC;
"""
|
|
|
|
# Run the query and load the whole result set into a DataFrame.
df = pd.read_sql(sql, engine)

# list all the columns in the dataframe
print(df.columns)

#
# Update the 'events_multi_day' column to indicate whether an event spans multiple days
#
# An event is flagged when more than one result row shares its
# 'datetime_venue_event_id'. duplicated(keep=False) marks every row of such an
# event in one pass, instead of rebuilding a full-frame mask once per event.
# Rows of events that appear only once keep the value read from the database.
multi_day_rows = df.duplicated(subset='datetime_venue_event_id', keep=False)
df.loc[multi_day_rows, 'events_multi_day'] = True

# Drop duplicate columns like venue_id, event_id, etc.
# These columns are not needed since we have the unique identifier 'datetime_venue_id' for each row.
# The timestamp column names are lower-case here because the query's aliases
# are unquoted and PostgreSQL folds unquoted identifiers to lower case.
df = df.drop(
    columns=[
        'datetime_venue_event_id',
        'datetime_venue_venue_id',
        'events_venue_id',
        'datetime_venue_updatedat',
        'events_updatedat',
        'venues_updatedat',
        'datetime_venue_createdat',
        'events_createdat',
        'venues_createdat',
    ]
)

# Print the first few rows of the dataframe
print(df.head())

# Save the dataframe to a CSV file (written to the current working directory)
df.to_csv('events.csv', index=False, encoding='utf-8')
|
|
|
|
# SQL statement to combine information from three tables:
|
|
# datetime_venue has a row for each event date and venue
|
|
# events has a row containing metadata for each event
|
|
# venues has a row containing metadata for each venue
|
|
#
|
|
# an example SQL query is:
|
|
#
|
|
# SELECT datetime_venue.start_time, datetime_venue.end_time, events.title, venues.name FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id;
|
|
#
|
|
# The query we will run returns all columns from the three tables.
|
|
#
|
|
# SELECT * FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id;
|
|
#
|
|
# We only care about events since the start of 2022.
|
|
#
|
|
# SELECT * FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id WHERE datetime_venue.start_time >= '2022-01-01';
|
|
#
|
|
# Since this data will be imported elsewhere, we want to be sure that each column is named uniquely.
|
|
# We will prefix each column with the table name.
|
|
#
|
|
# SELECT datetime_venue.id as datetime_venue_id,datetime_venue.event_id as datetime_venue_event_id, datetime_venue.venue_id as datetime_venue_venue_id, datetime_venue.start_time as datetime_venue_start_time, datetime_venue.end_time as datetime_venue_end_time, datetime_venue.optional_title as datetime_venue_optional_title, datetime_venue."createdAt" as datetime_venue_createdAt, datetime_venue."updatedAt" as datetime_venue_updatedAt, datetime_venue.timezone as datetime_venue_timezone, datetime_venue.category as datetime_venue_category, events.id as events_id, events.title as events_title, events.slug as events_slug, events.multi_day as events_multi_day, events.image as events_image, events.social_image as events_social_image, events.venue_id as events_venue_id, events.admission_fee as events_admission_fee, events.organizer_contact as events_organizer_contact, events.brief_description as events_brief_description, events.description as events_description, events.links as events_links, events.website_link as events_website_link, events.ticket_link as events_ticket_link, events.fb_event_link as events_fb_event_link, events.eventbrite_link as events_eventbrite_link, events.bitly_link as events_bitly_link, events.tags as events_tags, events.verified as events_verified, events."createdAt" as events_createdAt, events."updatedAt" as events_updatedAt, events.reviewed_by_org as events_reviewed_by_org, events.accessibility as events_accessibility, events.category as events_category, events.condition as events_condition, events.mode as events_mode, venues.id as venues_id, venues.name as venues_name, venues.slug as venues_slug, venues.address as venues_address, venues.g_map_link as venues_g_map_link, venues."createdAt" as venues_createdAt, venues."updatedAt" as venues_updatedAt, venues.is_soft_deleted as venues_is_soft_deleted, venues.gps_lat as venues_gps_lat, venues.gps_long as venues_gps_long, venues.gps_alt as venues_gps_alt, venues.street as venues_street, venues.city as 
# venues_city, venues.state as venues_state, venues.zip as venues_zip, venues.neighborhood as venues_neighborhood FROM datetime_venue JOIN events ON datetime_venue.event_id = events.id JOIN venues ON datetime_venue.venue_id = venues.id WHERE datetime_venue.start_time >= '2022-01-01';
|
|
#
|
|
"""
|
|
-- DB table info
|
|
|
|
CREATE TABLE datetime_venue (
|
|
id uuid NOT NULL,
|
|
event_id uuid NOT NULL,
|
|
venue_id uuid,
|
|
start_time timestamp with time zone NOT NULL,
|
|
end_time timestamp with time zone NOT NULL,
|
|
optional_title character varying(255),
|
|
"createdAt" timestamp with time zone DEFAULT now() NOT NULL,
|
|
"updatedAt" timestamp with time zone DEFAULT now() NOT NULL,
|
|
timezone character varying(255) DEFAULT 'US/Eastern'::character varying NOT NULL,
|
|
category character varying(255)
|
|
);
|
|
|
|
|
|
CREATE TABLE events (
|
|
id uuid DEFAULT uuid_generate_v4() NOT NULL,
|
|
title character varying(255) NOT NULL,
|
|
slug character varying(255) NOT NULL,
|
|
multi_day boolean,
|
|
image character varying(255),
|
|
social_image character varying(255),
|
|
venue_id uuid,
|
|
admission_fee character varying(255),
|
|
organizer_contact character varying(255),
|
|
brief_description text,
|
|
description text,
|
|
links character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[],
|
|
website_link text,
|
|
ticket_link text,
|
|
fb_event_link character varying(255),
|
|
eventbrite_link character varying(255),
|
|
bitly_link character varying(255),
|
|
tags character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[],
|
|
verified boolean DEFAULT false NOT NULL,
|
|
"createdAt" timestamp with time zone DEFAULT now() NOT NULL,
|
|
"updatedAt" timestamp with time zone DEFAULT now() NOT NULL,
|
|
reviewed_by_org character varying,
|
|
accessibility character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[],
|
|
category character varying(255),
|
|
condition character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[],
|
|
mode character varying(255)
|
|
);
|
|
|
|
|
|
|
|
CREATE TABLE venues (
|
|
id uuid DEFAULT uuid_generate_v4() NOT NULL,
|
|
name character varying(255),
|
|
slug character varying(255),
|
|
address character varying(255),
|
|
g_map_link character varying(255),
|
|
"createdAt" timestamp with time zone DEFAULT now() NOT NULL,
|
|
"updatedAt" timestamp with time zone DEFAULT now() NOT NULL,
|
|
is_soft_deleted boolean DEFAULT false NOT NULL,
|
|
gps_lat double precision,
|
|
gps_long double precision,
|
|
gps_alt double precision,
|
|
street character varying(255) DEFAULT NULL::character varying,
|
|
city character varying(255) DEFAULT NULL::character varying,
|
|
state character varying(255) DEFAULT NULL::character varying,
|
|
zip character varying(255) DEFAULT NULL::character varying,
|
|
neighborhood character varying(255) DEFAULT NULL::character varying
|
|
);
|
|
"""
|