finished readme and moved terraform files

This commit is contained in:
Dallin Bentley 2024-09-10 20:50:46 -06:00
parent 00d189a99b
commit e8af784283
6 changed files with 103 additions and 30 deletions

View File

@ -38,8 +38,74 @@ Additionally, we found that having a close integration between the data warehouse
- **Bring Your Own Storage:** We felt that customers should own their data and not be locked into a particular storage engine.
## Quickstart
Have the following ready before you start:
1. Dependencies:
- Make sure that you have [Docker Engine](https://docs.docker.com/engine/install/) installed.
- Install [Python](https://www.python.org/downloads/) if you haven't already.
- Install a [MySQL client](https://dev.mysql.com/downloads/mysql/) on your system.
- An AWS account with S3 access.
2. Clone the repository:
```bash
git clone https://github.com/buster-so/warehouse.git
```
3. Run the warehouse:
```bash
docker compose up -d
```
4. Populate the `.env` file with AWS credentials provisioned for S3 access. **Note: You can use any S3-compatible storage; you might just need to tweak some of the configs.** Feel free to look at the StarRocks [docs](https://docs.starrocks.com/en-us/main/loading/iceberg/iceberg_external_catalog) or PyIceberg [docs](https://iceberg.apache.org/docs/latest/spark-configuration/) for more information.
5. Connect to the warehouse with any MySQL client.
6. Create the external catalog:
```sql
CREATE EXTERNAL CATALOG 'public'
PROPERTIES
(
"type"="iceberg",
"iceberg.catalog.type"="rest",
"iceberg.catalog.uri"="http://iceberg-rest:8181",
"iceberg.catalog.warehouse"="<BUCKET_NAME>",
"aws.s3.access_key"="<ACCESS_KEY>",
"aws.s3.secret_key"="<SECRET_KEY>",
"aws.s3.region" = "<REGION>",
"aws.s3.enable_path_style_access"="true",
"client.factory"="com.starrocks.connector.iceberg.IcebergAwsClientFactory"
);
```
7. Seed the data. If you want to populate a table with 75m records, you can run the notebook found [here](/notebooks/populate_warehouse.ipynb).
8. Set the catalog:
```sql
SET CATALOG 'public';
```
9. Set the database:
```sql
USE DATABASE 'public';
```
10. Run a query:
```sql
SELECT COUNT(*) FROM public.nyc_taxi;
```
### Optimizations
For data that you think will be accessed frequently, you can cache it on disk for faster access with:
```sql
CACHE SELECT * FROM public.nyc_taxi WHERE tpep_pickup_datetime > '2022-03-01';
```
## Roadmap

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 162,
"execution_count": 1,
"metadata": {},
"outputs": [
{
@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 163,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": 164,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -37,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 165,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -48,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 166,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -57,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": 167,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@ -95,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 168,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@ -109,7 +109,7 @@
},
{
"cell_type": "code",
"execution_count": 169,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@ -123,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": 170,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@ -143,14 +143,21 @@
},
{
"cell_type": "code",
"execution_count": 171,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Appending files: 100%|██████████| 26/26 [10:19<00:00, 23.83s/it, Appended yellow_tripdata_2021-09.parquet]\n"
"Appending files: 0%| | 0/26 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Appending files: 100%|██████████| 26/26 [03:56<00:00, 9.10s/it, Appended 2022-12]\n"
]
},
{
@ -162,35 +169,35 @@
}
],
"source": [
"import os\n",
"import requests\n",
"import io\n",
"import pyarrow.parquet as pq\n",
"from tqdm import tqdm\n",
"from datetime import datetime, timedelta\n",
"\n",
"# GitHub repository information\n",
"repo_owner = \"buster-so\"\n",
"repo_name = \"sample-data\"\n",
"folder_path = \"nyc_taxi\"\n",
"# Base URL for the Parquet files\n",
"base_url = \"https://pub-f6a668561f5e4bd6ac651efd8c18998d.r2.dev/nyc_taxi/yellow_tripdata_{}.parquet\"\n",
"\n",
"# GitHub API endpoint to get repository contents\n",
"api_url = f\"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{folder_path}\"\n",
"# Generate a list of dates from 2020-11 to 2022-12\n",
"start_date = datetime(2020, 11, 1)\n",
"end_date = datetime(2022, 12, 1)\n",
"date_list = []\n",
"\n",
"# Fetch the list of files in the repository\n",
"response = requests.get(api_url)\n",
"if response.status_code != 200:\n",
" raise Exception(f\"Failed to fetch repository contents: {response.status_code}\")\n",
"\n",
"files = [item for item in response.json() if item['name'].endswith('.parquet')]\n",
"current_date = start_date\n",
"while current_date <= end_date:\n",
" date_list.append(current_date.strftime(\"%Y-%m\"))\n",
" current_date += timedelta(days=32)\n",
" current_date = current_date.replace(day=1)\n",
"\n",
"# Create a progress bar\n",
"with tqdm(total=len(files), desc=\"Appending files\") as pbar:\n",
" for file in files:\n",
"with tqdm(total=len(date_list), desc=\"Appending files\") as pbar:\n",
" for date_str in date_list:\n",
" file_url = base_url.format(date_str)\n",
" \n",
" # Download the file content\n",
" file_url = file['download_url']\n",
" file_response = requests.get(file_url)\n",
" if file_response.status_code != 200:\n",
" print(f\"Failed to download {file['name']}: {file_response.status_code}\")\n",
" print(f\"Failed to download {date_str}: {file_response.status_code}\")\n",
" continue\n",
" \n",
" # Read the Parquet file from the response content\n",
@ -201,7 +208,7 @@
" table.append(df)\n",
" \n",
" pbar.update(1)\n",
" pbar.set_postfix_str(f\"Appended {file['name']}\")\n",
" pbar.set_postfix_str(f\"Appended {date_str}\")\n",
"\n",
"# Print the total number of rows in the table after appending all files\n",
"print(f\"Total rows in the table: {len(table.scan().to_arrow())}\")"