mirror of https://github.com/buster-so/buster.git
finished readme and moved terraform files
This commit is contained in:
parent
00d189a99b
commit
e8af784283
68
README.md
68
README.md
|
@ -38,8 +38,74 @@ Additionally, we found that having a close integration between the data warehouse
|
|||
- **Bring Your Own Storage:** We felt that customers should own their data and not be locked into a particular storage engine.
|
||||
|
||||
## Quickstart
|
||||
Have
|
||||
|
||||
1. Dependencies:
|
||||
- Make sure that you have [Docker Engine](https://docs.docker.com/engine/install/) installed.
|
||||
- Install [Python](https://www.python.org/downloads/) if you haven't already.
|
||||
- Install a [MySQL client](https://dev.mysql.com/downloads/mysql/) on your system.
|
||||
- An AWS account with S3 access.
|
||||
|
||||
2. Clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/buster-so/warehouse.git
|
||||
```
|
||||
|
||||
3. Run the warehouse:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
4. Populate the `.env` file with AWS credentials provisioned for S3 access. **Note: You can use any S3 compatible storage, you might just need to tweak some of the configs.** Feel free to look at the Starrocks [docs](https://docs.starrocks.com/en-us/main/loading/iceberg/iceberg_external_catalog) or PyIceberg [docs](https://iceberg.apache.org/docs/latest/spark-configuration/) for more information.
|
||||
|
||||
5. Connect to the warehouse with any MySQL client.
|
||||
|
||||
6. Create the external catalog:
|
||||
|
||||
```sql
|
||||
CREATE EXTERNAL CATALOG 'public'
|
||||
PROPERTIES
|
||||
(
|
||||
"type"="iceberg",
|
||||
"iceberg.catalog.type"="rest",
|
||||
"iceberg.catalog.uri"="http://iceberg-rest:8181",
|
||||
"iceberg.catalog.warehouse"="<BUCKET_NAME>",
|
||||
"aws.s3.access_key"="<ACCESS_KEY>",
|
||||
"aws.s3.secret_key"="<SECRET_KEY>",
|
||||
"aws.s3.region" = "<REGION>",
|
||||
"aws.s3.enable_path_style_access"="true",
|
||||
"client.factory"="com.starrocks.connector.iceberg.IcebergAwsClientFactory"
|
||||
);
|
||||
```
|
||||
|
||||
7. Seed the data. If you want to populate a table with 75m records, you can run the notebook found [here](/notebooks/populate_warehouse.ipynb).
|
||||
|
||||
8. Set the catalog
|
||||
|
||||
```sql
|
||||
SET CATALOG 'public';
|
||||
```
|
||||
|
||||
9. Set the database
|
||||
|
||||
```sql
|
||||
USE DATABASE 'public';
|
||||
```
|
||||
|
||||
10. Run a query
|
||||
|
||||
```sql
|
||||
SELECT COUNT(*) FROM public.nyc_taxi;
|
||||
```
|
||||
|
||||
### Optimizations
|
||||
|
||||
For data that you think will be accessed frequently, you can cache it on disk for faster access with:
|
||||
|
||||
```sql
|
||||
CACHE SELECT * FROM public.nyc_taxi WHERE tpep_pickup_datetime > '2022-03-01';
|
||||
```
|
||||
|
||||
## Roadmap
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 162,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -19,7 +19,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 163,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -28,7 +28,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 164,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -37,7 +37,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 165,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -48,7 +48,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 166,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -57,7 +57,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 167,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -95,7 +95,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 168,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -109,7 +109,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 169,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -123,7 +123,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 170,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -143,14 +143,21 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 171,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Appending files: 100%|██████████| 26/26 [10:19<00:00, 23.83s/it, Appended yellow_tripdata_2021-09.parquet]\n"
|
||||
"Appending files: 0%| | 0/26 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Appending files: 100%|██████████| 26/26 [03:56<00:00, 9.10s/it, Appended 2022-12]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -162,35 +169,35 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"import io\n",
|
||||
"import pyarrow.parquet as pq\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from datetime import datetime, timedelta\n",
|
||||
"\n",
|
||||
"# GitHub repository information\n",
|
||||
"repo_owner = \"buster-so\"\n",
|
||||
"repo_name = \"sample-data\"\n",
|
||||
"folder_path = \"nyc_taxi\"\n",
|
||||
"# Base URL for the Parquet files\n",
|
||||
"base_url = \"https://pub-f6a668561f5e4bd6ac651efd8c18998d.r2.dev/nyc_taxi/yellow_tripdata_{}.parquet\"\n",
|
||||
"\n",
|
||||
"# GitHub API endpoint to get repository contents\n",
|
||||
"api_url = f\"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{folder_path}\"\n",
|
||||
"# Generate a list of dates from 2020-11 to 2022-12\n",
|
||||
"start_date = datetime(2020, 11, 1)\n",
|
||||
"end_date = datetime(2022, 12, 1)\n",
|
||||
"date_list = []\n",
|
||||
"\n",
|
||||
"# Fetch the list of files in the repository\n",
|
||||
"response = requests.get(api_url)\n",
|
||||
"if response.status_code != 200:\n",
|
||||
" raise Exception(f\"Failed to fetch repository contents: {response.status_code}\")\n",
|
||||
"\n",
|
||||
"files = [item for item in response.json() if item['name'].endswith('.parquet')]\n",
|
||||
"current_date = start_date\n",
|
||||
"while current_date <= end_date:\n",
|
||||
" date_list.append(current_date.strftime(\"%Y-%m\"))\n",
|
||||
" current_date += timedelta(days=32)\n",
|
||||
" current_date = current_date.replace(day=1)\n",
|
||||
"\n",
|
||||
"# Create a progress bar\n",
|
||||
"with tqdm(total=len(files), desc=\"Appending files\") as pbar:\n",
|
||||
" for file in files:\n",
|
||||
"with tqdm(total=len(date_list), desc=\"Appending files\") as pbar:\n",
|
||||
" for date_str in date_list:\n",
|
||||
" file_url = base_url.format(date_str)\n",
|
||||
" \n",
|
||||
" # Download the file content\n",
|
||||
" file_url = file['download_url']\n",
|
||||
" file_response = requests.get(file_url)\n",
|
||||
" if file_response.status_code != 200:\n",
|
||||
" print(f\"Failed to download {file['name']}: {file_response.status_code}\")\n",
|
||||
" print(f\"Failed to download {date_str}: {file_response.status_code}\")\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" # Read the Parquet file from the response content\n",
|
||||
|
@ -201,7 +208,7 @@
|
|||
" table.append(df)\n",
|
||||
" \n",
|
||||
" pbar.update(1)\n",
|
||||
" pbar.set_postfix_str(f\"Appended {file['name']}\")\n",
|
||||
" pbar.set_postfix_str(f\"Appended {date_str}\")\n",
|
||||
"\n",
|
||||
"# Print the total number of rows in the table after appending all files\n",
|
||||
"print(f\"Total rows in the table: {len(table.scan().to_arrow())}\")"
|
||||
|
|
Loading…
Reference in New Issue