finished readme and moved terraform files

This commit is contained in:
Dallin Bentley 2024-09-10 20:50:46 -06:00
parent 00d189a99b
commit e8af784283
6 changed files with 103 additions and 30 deletions

View File

@ -38,8 +38,74 @@ Additionally, we found that having a close integration between the data warehouse
- **Bring Your Own Storage:** We felt that customers should own their data and not be locked into a particular storage engine. - **Bring Your Own Storage:** We felt that customers should own their data and not be locked into a particular storage engine.
## Quickstart ## Quickstart
Have the following prerequisites in place before you begin:
1. Dependencies:
- Make sure that you have [Docker Engine](https://docs.docker.com/engine/install/) installed.
- Install [Python](https://www.python.org/downloads/) if you haven't already.
- Install a [MySQL client](https://dev.mysql.com/downloads/mysql/) on your system.
- An AWS account with S3 access.
2. Clone the repository:
```bash
git clone https://github.com/buster-so/warehouse.git
```
3. Run the warehouse:
```bash
docker compose up -d
```
4. Populate the `.env` file with AWS credentials provisioned for S3 access. **Note: You can use any S3 compatible storage, you might just need to tweak some of the configs.** Feel free to look at the Starrocks [docs](https://docs.starrocks.com/en-us/main/loading/iceberg/iceberg_external_catalog) or PyIceberg [docs](https://iceberg.apache.org/docs/latest/spark-configuration/) for more information.
5. Connect to the warehouse with any MySQL client.
6. Create the external catalog:
```sql
CREATE EXTERNAL CATALOG 'public'
PROPERTIES
(
"type"="iceberg",
"iceberg.catalog.type"="rest",
"iceberg.catalog.uri"="http://iceberg-rest:8181",
"iceberg.catalog.warehouse"="<BUCKET_NAME>",
"aws.s3.access_key"="<ACCESS_KEY>",
"aws.s3.secret_key"="<SECRET_KEY>",
"aws.s3.region" = "<REGION>",
"aws.s3.enable_path_style_access"="true",
"client.factory"="com.starrocks.connector.iceberg.IcebergAwsClientFactory"
);
```
7. Seed the data. If you want to populate a table with 75m records, you can run the notebook found [here](/notebooks/populate_warehouse.ipynb).
8. Set the catalog
```sql
SET CATALOG 'public';
```
9. Set the database
```sql
USE DATABASE 'public';
```
10. Run a query
```sql
SELECT COUNT(*) FROM public.nyc_taxi;
```
### Optimizations
For data that you think will be accessed frequently, you can cache it on disk for faster access with:
```sql
CACHE SELECT * FROM public.nyc_taxi WHERE tpep_pickup_datetime > '2022-03-01';
```
## Roadmap ## Roadmap

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 162, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -19,7 +19,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 163, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -28,7 +28,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 164, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -37,7 +37,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 165, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -48,7 +48,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 166, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -57,7 +57,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 167, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -95,7 +95,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 168, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -109,7 +109,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 169, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -123,7 +123,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 170, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -143,14 +143,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 171, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Appending files: 100%|██████████| 26/26 [10:19<00:00, 23.83s/it, Appended yellow_tripdata_2021-09.parquet]\n" "Appending files: 0%| | 0/26 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Appending files: 100%|██████████| 26/26 [03:56<00:00, 9.10s/it, Appended 2022-12]\n"
] ]
}, },
{ {
@ -162,35 +169,35 @@
} }
], ],
"source": [ "source": [
"import os\n",
"import requests\n", "import requests\n",
"import io\n", "import io\n",
"import pyarrow.parquet as pq\n", "import pyarrow.parquet as pq\n",
"from tqdm import tqdm\n", "from tqdm import tqdm\n",
"from datetime import datetime, timedelta\n",
"\n", "\n",
"# GitHub repository information\n", "# Base URL for the Parquet files\n",
"repo_owner = \"buster-so\"\n", "base_url = \"https://pub-f6a668561f5e4bd6ac651efd8c18998d.r2.dev/nyc_taxi/yellow_tripdata_{}.parquet\"\n",
"repo_name = \"sample-data\"\n",
"folder_path = \"nyc_taxi\"\n",
"\n", "\n",
"# GitHub API endpoint to get repository contents\n", "# Generate a list of dates from 2020-11 to 2022-12\n",
"api_url = f\"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{folder_path}\"\n", "start_date = datetime(2020, 11, 1)\n",
"end_date = datetime(2022, 12, 1)\n",
"date_list = []\n",
"\n", "\n",
"# Fetch the list of files in the repository\n", "current_date = start_date\n",
"response = requests.get(api_url)\n", "while current_date <= end_date:\n",
"if response.status_code != 200:\n", " date_list.append(current_date.strftime(\"%Y-%m\"))\n",
" raise Exception(f\"Failed to fetch repository contents: {response.status_code}\")\n", " current_date += timedelta(days=32)\n",
"\n", " current_date = current_date.replace(day=1)\n",
"files = [item for item in response.json() if item['name'].endswith('.parquet')]\n",
"\n", "\n",
"# Create a progress bar\n", "# Create a progress bar\n",
"with tqdm(total=len(files), desc=\"Appending files\") as pbar:\n", "with tqdm(total=len(date_list), desc=\"Appending files\") as pbar:\n",
" for file in files:\n", " for date_str in date_list:\n",
" file_url = base_url.format(date_str)\n",
" \n",
" # Download the file content\n", " # Download the file content\n",
" file_url = file['download_url']\n",
" file_response = requests.get(file_url)\n", " file_response = requests.get(file_url)\n",
" if file_response.status_code != 200:\n", " if file_response.status_code != 200:\n",
" print(f\"Failed to download {file['name']}: {file_response.status_code}\")\n", " print(f\"Failed to download {date_str}: {file_response.status_code}\")\n",
" continue\n", " continue\n",
" \n", " \n",
" # Read the Parquet file from the response content\n", " # Read the Parquet file from the response content\n",
@ -201,7 +208,7 @@
" table.append(df)\n", " table.append(df)\n",
" \n", " \n",
" pbar.update(1)\n", " pbar.update(1)\n",
" pbar.set_postfix_str(f\"Appended {file['name']}\")\n", " pbar.set_postfix_str(f\"Appended {date_str}\")\n",
"\n", "\n",
"# Print the total number of rows in the table after appending all files\n", "# Print the total number of rows in the table after appending all files\n",
"print(f\"Total rows in the table: {len(table.scan().to_arrow())}\")" "print(f\"Total rows in the table: {len(table.scan().to_arrow())}\")"