finished readme and moved terraform files

This commit is contained in:
Dallin Bentley 2024-09-10 20:50:46 -06:00
parent 00d189a99b
commit e8af784283
6 changed files with 103 additions and 30 deletions

View File

@ -38,8 +38,74 @@ Additionally, we found that having a close integration between the data warehouse
- **Bring Your Own Storage:** We felt that customers should own their data and not be locked into a particular storage engine. - **Bring Your Own Storage:** We felt that customers should own their data and not be locked into a particular storage engine.
## Quickstart ## Quickstart
Have the following prerequisites in place before you begin:
1. Dependencies:
- Make sure that you have [Docker Engine](https://docs.docker.com/engine/install/) installed.
- Install [Python](https://www.python.org/downloads/) if you haven't already.
- Install a [MySQL client](https://dev.mysql.com/downloads/mysql/) on your system.
- An AWS account with S3 access.
2. Clone the repository:
```bash
git clone https://github.com/buster-so/warehouse.git
```
3. Run the warehouse:
```bash
docker compose up -d
```
4. Populate the `.env` file with AWS credentials provisioned for S3 access. **Note: You can use any S3 compatible storage, you might just need to tweak some of the configs.** Feel free to look at the Starrocks [docs](https://docs.starrocks.com/en-us/main/loading/iceberg/iceberg_external_catalog) or PyIceberg [docs](https://iceberg.apache.org/docs/latest/spark-configuration/) for more information.
5. Connect to the warehouse with any MySQL client.
6. Create the external catalog:
```sql
CREATE EXTERNAL CATALOG 'public'
PROPERTIES
(
"type"="iceberg",
"iceberg.catalog.type"="rest",
"iceberg.catalog.uri"="http://iceberg-rest:8181",
"iceberg.catalog.warehouse"="<BUCKET_NAME>",
"aws.s3.access_key"="<ACCESS_KEY>",
"aws.s3.secret_key"="<SECRET_KEY>",
"aws.s3.region" = "<REGION>",
"aws.s3.enable_path_style_access"="true",
"client.factory"="com.starrocks.connector.iceberg.IcebergAwsClientFactory"
);
```
7. Seed the data. If you want to populate a table with 75m records, you can run the notebook found [here](/notebooks/populate_warehouse.ipynb).
8. Set the catalog
```sql
SET CATALOG 'public';
```
9. Set the database
```sql
USE DATABASE 'public';
```
10. Run a query
```sql
SELECT COUNT(*) FROM public.nyc_taxi;
```
### Optimizations
For data that you think will be accessed frequently, you can cache it on disk for faster access with:
```sql
CACHE SELECT * FROM public.nyc_taxi WHERE tpep_pickup_datetime > '2022-03-01';
```
## Roadmap ## Roadmap

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 162, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -19,7 +19,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 163, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -28,7 +28,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 164, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -37,7 +37,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 165, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -48,7 +48,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 166, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -57,7 +57,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 167, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -95,7 +95,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 168, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -109,7 +109,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 169, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -123,7 +123,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 170, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -143,14 +143,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 171, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Appending files: 100%|██████████| 26/26 [10:19<00:00, 23.83s/it, Appended yellow_tripdata_2021-09.parquet]\n" "Appending files: 0%| | 0/26 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Appending files: 100%|██████████| 26/26 [03:56<00:00, 9.10s/it, Appended 2022-12]\n"
] ]
}, },
{ {
@ -162,35 +169,35 @@
} }
], ],
"source": [ "source": [
"import os\n",
"import requests\n", "import requests\n",
"import io\n", "import io\n",
"import pyarrow.parquet as pq\n", "import pyarrow.parquet as pq\n",
"from tqdm import tqdm\n", "from tqdm import tqdm\n",
"from datetime import datetime, timedelta\n",
"\n", "\n",
"# GitHub repository information\n", "# Base URL for the Parquet files\n",
"repo_owner = \"buster-so\"\n", "base_url = \"https://pub-f6a668561f5e4bd6ac651efd8c18998d.r2.dev/nyc_taxi/yellow_tripdata_{}.parquet\"\n",
"repo_name = \"sample-data\"\n",
"folder_path = \"nyc_taxi\"\n",
"\n", "\n",
"# GitHub API endpoint to get repository contents\n", "# Generate a list of dates from 2020-11 to 2022-12\n",
"api_url = f\"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{folder_path}\"\n", "start_date = datetime(2020, 11, 1)\n",
"end_date = datetime(2022, 12, 1)\n",
"date_list = []\n",
"\n", "\n",
"# Fetch the list of files in the repository\n", "current_date = start_date\n",
"response = requests.get(api_url)\n", "while current_date <= end_date:\n",
"if response.status_code != 200:\n", " date_list.append(current_date.strftime(\"%Y-%m\"))\n",
" raise Exception(f\"Failed to fetch repository contents: {response.status_code}\")\n", " current_date += timedelta(days=32)\n",
"\n", " current_date = current_date.replace(day=1)\n",
"files = [item for item in response.json() if item['name'].endswith('.parquet')]\n",
"\n", "\n",
"# Create a progress bar\n", "# Create a progress bar\n",
"with tqdm(total=len(files), desc=\"Appending files\") as pbar:\n", "with tqdm(total=len(date_list), desc=\"Appending files\") as pbar:\n",
" for file in files:\n", " for date_str in date_list:\n",
" file_url = base_url.format(date_str)\n",
" \n",
" # Download the file content\n", " # Download the file content\n",
" file_url = file['download_url']\n",
" file_response = requests.get(file_url)\n", " file_response = requests.get(file_url)\n",
" if file_response.status_code != 200:\n", " if file_response.status_code != 200:\n",
" print(f\"Failed to download {file['name']}: {file_response.status_code}\")\n", " print(f\"Failed to download {date_str}: {file_response.status_code}\")\n",
" continue\n", " continue\n",
" \n", " \n",
" # Read the Parquet file from the response content\n", " # Read the Parquet file from the response content\n",
@ -201,7 +208,7 @@
" table.append(df)\n", " table.append(df)\n",
" \n", " \n",
" pbar.update(1)\n", " pbar.update(1)\n",
" pbar.set_postfix_str(f\"Appended {file['name']}\")\n", " pbar.set_postfix_str(f\"Appended {date_str}\")\n",
"\n", "\n",
"# Print the total number of rows in the table after appending all files\n", "# Print the total number of rows in the table after appending all files\n",
"print(f\"Total rows in the table: {len(table.scan().to_arrow())}\")" "print(f\"Total rows in the table: {len(table.scan().to_arrow())}\")"