diff --git a/.env.example b/.env.example
new file mode 100644
index 000000000..e9d4eaf9c
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,4 @@
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=
+AWS_REGION=
+CATALOG_WAREHOUSE=s3://your-warehouse-bucket/
diff --git a/.gitignore b/.gitignore
index 2faf43d0a..1c7703c74 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,5 @@ override.tf.json
# Ignore CLI configuration files
.terraformrc
terraform.rc
+
+.env
diff --git a/LICENSE b/LICENSE
index 480ea6a7f..7e1f824e8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,10 @@
-MIT License
+Copyright (c) 2024 Sprint Labs, Inc.
 
-Copyright (c) 2024 Buster
+Portions of this software are licensed as follows:
+
+- All content that resides under the "ee/" and/or "web/src/ee" directories of this repository, if these directories exist, is licensed under the license defined in "ee/LICENSE".
+- All third party components incorporated into the Sprint Labs Software are licensed under the original license provided by the owner of the applicable component.
+- Content outside of the above-mentioned directories or restrictions is available under the "MIT Expat" license as defined below.
 
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +22,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
index dcf40b23b..9eefed61f 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,78 @@
-# warehouse
-This is a warehouse built on Apache Iceberg and Starrocks as a query engine.
+# Buster Warehouse
+
+![Buster Warehouse](assets/image.png)
+
+A data warehouse built on Apache Iceberg and Starrocks.
+
+## Overview
+
+This project is a data warehouse built on Apache Iceberg and Starrocks. In working with our customers, we found that Snowflake, BigQuery, and other warehouse solutions were prohibitively expensive or too slow for deploying AI-powered analytics at scale.
+
+Additionally, we found that a close integration between the data warehouse and our AI-native BI tool makes for a better and more reliable data experience.
+
+### Key Features
+
+- **Built on Starrocks:** We felt that Starrocks was the best query engine for our use case. The main thing that pushed us towards it was that it performs predicate pushdown on Iceberg tables, whereas ClickHouse and DuckDB do not (see the example query below). We were also impressed by Starrocks' performance, caching system, and flexibility.
+- **Built on Apache Iceberg:** Some of the top companies in the world use Apache Iceberg for storing and interacting with their data. We wanted a table format that not only brought tremendous benefits, but one that companies wouldn't outgrow.
+- **Bring Your Own Storage:** We felt that customers should own their data and not be locked into a particular storage engine.
+
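+As an illustration, here is the kind of query where that pushdown pays off. This is a sketch that assumes the external catalog is attached as `iceberg_catalog` (see the Quickstart below) and uses the `public.nyc_taxi` table from `python/populate_warehouse.ipynb`, which is partitioned by pickup month, so the date filter lets Starrocks prune whole partitions instead of scanning the full table:
+
+```sql
+SELECT count(*) AS trips, avg(trip_distance) AS avg_distance
+FROM iceberg_catalog.public.nyc_taxi
+WHERE tpep_pickup_datetime >= '2021-01-01 00:00:00'
+  AND tpep_pickup_datetime < '2021-02-01 00:00:00';
+```
+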
+## Quickstart
+
+1. Copy `.env.example` to `.env` and fill in your AWS credentials, region, and the S3 bucket to use as the Iceberg warehouse.
+2. Run `docker compose up -d`. This starts a Starrocks frontend, a Starrocks compute node, Postgres, and an Iceberg REST catalog backed by Postgres.
+3. Connect to Starrocks with any MySQL client: `mysql -h 127.0.0.1 -P 9030 -u root`.
+
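+From there, you can attach the REST catalog to Starrocks as an external Iceberg catalog. A minimal sketch, assuming the defaults from `docker-compose.yml`; the name `iceberg_catalog` is illustrative, and property names may vary slightly between Starrocks versions:
+
+```sql
+CREATE EXTERNAL CATALOG iceberg_catalog
+PROPERTIES (
+    "type" = "iceberg",
+    "iceberg.catalog.type" = "rest",
+    "iceberg.catalog.uri" = "http://iceberg-rest:8181",
+    "aws.s3.access_key" = "<AWS_ACCESS_KEY_ID>",
+    "aws.s3.secret_key" = "<AWS_SECRET_ACCESS_KEY>",
+    "aws.s3.region" = "<AWS_REGION>"
+);
+```
+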
+## Roadmap
+
+Currently, we are in the process of open-sourcing the platform. This includes:
+
+- Warehouse Product (This Repo) ✅
+- BI platform (https://buster.so) ⏰
+
+After that, we will release an official roadmap.
+
+## How We Plan to Make Money
+
+Currently, we offer a few commercial products:
+- Cloud-Hosted Version
+ - Cluster
+ - Serverless
+- Managed Self-Hosted Version
+
+## Support and feedback
+
+You can contact us through either:
+
+- [GitHub Discussions](https://github.com/orgs/buster-so/discussions)
+- Email us at founders at buster dot com
+
+## License
+
+This repository is MIT licensed, except for the `ee` folders. See [LICENSE](LICENSE) for more details.
+
+## Shoutouts
+
+The documentation from the Starrocks, Iceberg, and PyIceberg teams has been very helpful in building this project.
diff --git a/assets/image.png b/assets/image.png
new file mode 100644
index 000000000..329e4cee4
Binary files /dev/null and b/assets/image.png differ
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 000000000..6cd1e0c0d
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,114 @@
+version: "3"
+
+services:
+  starrocks-fe:
+    image: starrocks/fe-ubuntu:3.3-latest
+    env_file:
+      - .env
+    hostname: starrocks-fe
+    container_name: starrocks-fe
+    user: root
+    command: |
+      sh /opt/starrocks/fe/bin/start_fe.sh
+    ports:
+      - 8030:8030
+      - 9020:9020
+      - 9030:9030
+    networks:
+      iceberg_net:
+    environment:
+      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
+      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
+      - AWS_REGION=${AWS_REGION}
+    healthcheck:
+      test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW FRONTENDS\G" | grep "Alive: true"'
+      interval: 10s
+      timeout: 5s
+      retries: 3
+
+  starrocks-cn:
+    image: starrocks/cn-ubuntu:3.3-latest
+    command:
+      - /bin/bash
+      - -c
+      - |
+        ulimit -u 65535;
+        ulimit -n 65535;
+        echo "# Enable data cache" >> /opt/starrocks/cn/conf/cn.conf
+        echo "datacache_enable = true" >> /opt/starrocks/cn/conf/cn.conf
+        echo "datacache_mem_size = 80%" >> /opt/starrocks/cn/conf/cn.conf
+        echo "datacache_disk_size = 80%" >> /opt/starrocks/cn/conf/cn.conf
+        echo "datacache_auto_adjust_enable = true" >> /opt/starrocks/cn/conf/cn.conf
+        echo "starlet_use_star_cache = true" >> /opt/starrocks/cn/conf/cn.conf
+        echo "starlet_star_cache_disk_size_percent = 80" >> /opt/starrocks/cn/conf/cn.conf
+        echo "lake_compaction_stream_buffer_size_bytes = 5000000000" >> /opt/starrocks/cn/conf/cn.conf
+        sleep 15s
+        mysql --connect-timeout 2 -h starrocks-fe -P 9030 -u root -e "ALTER SYSTEM ADD BACKEND \"starrocks-cn:9050\";"
+        /opt/starrocks/cn/bin/start_cn.sh
+    ports:
+      - 8040:8040
+    hostname: starrocks-cn
+    container_name: starrocks-cn
+    user: root
+    depends_on:
+      - starrocks-fe
+    healthcheck:
+      test: 'mysql -u root -h starrocks-fe -P 9030 -e "SHOW BACKENDS\G" | grep "Alive: true"'
+      interval: 10s
+      timeout: 5s
+      retries: 3
+    networks:
+      iceberg_net:
+
+  postgres:
+    image: postgres:15
+    container_name: postgres
+    networks:
+      iceberg_net:
+        aliases:
+          - postgres
+    ports:
+      - 5432:5432
+    environment:
+      - POSTGRES_USER=postgres
+      - POSTGRES_PASSWORD=postgres
+      - POSTGRES_DB=postgres
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U postgres"]
+      interval: 5s
+      timeout: 5s
+      retries: 5
+
+  rest:
+    image: tabulario/iceberg-rest:latest
+    env_file:
+      - .env
+    container_name: iceberg-rest
+    networks:
+      iceberg_net:
+        aliases:
+          - iceberg-rest
+    ports:
+      - 8181:8181
+    environment:
+      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
+      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
+      - AWS_REGION=${AWS_REGION}
+      - AWS_DEFAULT_REGION=${AWS_REGION}
+      - CATALOG_WAREHOUSE=${CATALOG_WAREHOUSE}
+      - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
+      - CATALOG_URI=jdbc:postgresql://postgres:5432/postgres
+      - CATALOG_JDBC_USER=postgres
+      - CATALOG_JDBC_PASSWORD=postgres
+      - CATALOG_JDBC_INITIALIZE=true
+    depends_on:
+      postgres:
+        condition: service_healthy
+
+networks:
+  iceberg_net:
+
+volumes:
+  postgres_data:
diff --git a/ee/LICENSE b/ee/LICENSE
new file mode 100644
index 000000000..b7e7acdcc
--- /dev/null
+++ b/ee/LICENSE
@@ -0,0 +1,37 @@
+Buster Enterprise License (the "Enterprise License")
+Copyright (c) 2024 Sprint Labs, Inc. dba Buster ('Buster')
+
+With regard to the Buster Software:
+This software and associated documentation files (the "Software") may only be
+used in production, if you (and any entity that you represent) have agreed to,
+and are in compliance with, the Buster Terms of Service, available
+at https://buster.com/terms (the "Enterprise Terms"), or other
+agreement governing the use of the Software, as agreed by you and Buster,
+and otherwise have a valid Buster Enterprise license.
+
+Subject to the foregoing sentence, you are free to
+modify this Software and publish patches to the Software. You agree that Buster
+and/or its licensors (as applicable) retain all right, title and interest in and
+to all such modifications and/or patches, and all such modifications and/or
+patches may only be used, copied, modified, displayed, distributed, or otherwise
+exploited with a valid Buster Enterprise license.
+
+Notwithstanding the foregoing, you may copy and modify
+the Software for development and testing purposes, without requiring a
+subscription. You agree that Buster and/or its licensors (as applicable) retain
+all right, title and interest in and to all such modifications. You are not
+granted any other rights beyond what is expressly stated herein. Subject to the
+foregoing, it is forbidden to copy, merge, publish, distribute, sublicense,
+and/or sell the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+For all third party components incorporated into the Buster Software, those
+components are licensed under the original license provided by the owner of the
+applicable component.
\ No newline at end of file
diff --git a/helm_charts/starrocks.yaml b/helm_values/starrocks.yaml
similarity index 100%
rename from helm_charts/starrocks.yaml
rename to helm_values/starrocks.yaml
diff --git a/python/populate_warehouse.ipynb b/python/populate_warehouse.ipynb
new file mode 100644
index 000000000..e90131c91
--- /dev/null
+++ b/python/populate_warehouse.ipynb
@@ -0,0 +1,236 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 162,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env: PYICEBERG_MAX_WORKERS=300\n"
+ ]
+ }
+ ],
+ "source": [
+ "%env PYICEBERG_MAX_WORKERS=300"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 163,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install \"pyiceberg[s3fs]\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyiceberg.catalog.rest import RestCatalog"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "catalog = RestCatalog(\"public\", ** {\n",
+ " \"uri\": f\"http://localhost:8181\",\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "catalog.create_namespace(\"public\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyiceberg.schema import Schema\n",
+ "from pyiceberg.types import (\n",
+ " NestedField,\n",
+ " LongType,\n",
+ " TimestampType,\n",
+ " DoubleType,\n",
+ " StringType,\n",
+ ")\n",
+ "\n",
+ "schema = Schema(\n",
+ " NestedField(field_id=1, name=\"VendorID\", field_type=LongType(), required=False),\n",
+ " NestedField(field_id=2, name=\"tpep_pickup_datetime\", field_type=TimestampType(), required=False),\n",
+ " NestedField(field_id=3, name=\"tpep_dropoff_datetime\", field_type=TimestampType(), required=False),\n",
+ " NestedField(field_id=4, name=\"passenger_count\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=5, name=\"trip_distance\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=6, name=\"RatecodeID\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=7, name=\"store_and_fwd_flag\", field_type=StringType(), required=False),\n",
+ " NestedField(field_id=8, name=\"PULocationID\", field_type=LongType(), required=False),\n",
+ " NestedField(field_id=9, name=\"DOLocationID\", field_type=LongType(), required=False),\n",
+ " NestedField(field_id=10, name=\"payment_type\", field_type=LongType(), required=False),\n",
+ " NestedField(field_id=11, name=\"fare_amount\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=12, name=\"extra\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=13, name=\"mta_tax\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=14, name=\"tip_amount\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=15, name=\"tolls_amount\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=16, name=\"improvement_surcharge\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=17, name=\"total_amount\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=18, name=\"congestion_surcharge\", field_type=DoubleType(), required=False),\n",
+ " NestedField(field_id=19, name=\"airport_fee\", field_type=DoubleType(), required=False),\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyiceberg.partitioning import PartitionSpec, PartitionField\n",
+ "from pyiceberg.transforms import DayTransform, MonthTransform, YearTransform, BucketTransform\n",
+ "\n",
+ "partition_spec = PartitionSpec(\n",
+ " PartitionField(source_id=2, field_id=1001, transform=MonthTransform(), name=\"tpep_pickup_datetime_month\"),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyiceberg.table.sorting import SortOrder, SortField\n",
+ "from pyiceberg.transforms import IdentityTransform\n",
+ "\n",
+ "sort_order = SortOrder(\n",
+ " SortField(source_id=4, transform=IdentityTransform())\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "table = catalog.create_table(\n",
+ " identifier=\"public.nyc_taxi\",\n",
+ " schema=schema,\n",
+ " partition_spec=partition_spec,\n",
+ " sort_order=sort_order,\n",
+ " properties={\n",
+ " \"write.format.default\": \"parquet\",\n",
+ " \"write.parquet.compression-codec\": \"zstd\",\n",
+ " \"write.target-file-size-bytes\": \"536870912\",\n",
+ " \"s3.connect-timeout\": \"10000\"\n",
+ " }\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Appending files: 100%|██████████| 26/26 [10:19<00:00, 23.83s/it, Appended yellow_tripdata_2021-09.parquet]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total rows in the table: 73531304\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import requests\n",
+ "import io\n",
+ "import pyarrow.parquet as pq\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "# GitHub repository information\n",
+ "repo_owner = \"buster-so\"\n",
+ "repo_name = \"sample-data\"\n",
+ "folder_path = \"nyc_taxi\"\n",
+ "\n",
+ "# GitHub API endpoint to get repository contents\n",
+ "api_url = f\"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{folder_path}\"\n",
+ "\n",
+ "# Fetch the list of files in the repository\n",
+ "response = requests.get(api_url)\n",
+ "if response.status_code != 200:\n",
+ " raise Exception(f\"Failed to fetch repository contents: {response.status_code}\")\n",
+ "\n",
+ "files = [item for item in response.json() if item['name'].endswith('.parquet')]\n",
+ "\n",
+ "# Create a progress bar\n",
+ "with tqdm(total=len(files), desc=\"Appending files\") as pbar:\n",
+ " for file in files:\n",
+ " # Download the file content\n",
+ " file_url = file['download_url']\n",
+ " file_response = requests.get(file_url)\n",
+ " if file_response.status_code != 200:\n",
+ " print(f\"Failed to download {file['name']}: {file_response.status_code}\")\n",
+ " continue\n",
+ " \n",
+ " # Read the Parquet file from the response content\n",
+ " file_content = io.BytesIO(file_response.content)\n",
+ " df = pq.read_table(file_content)\n",
+ " \n",
+ " # Append to the Iceberg table\n",
+ " table.append(df)\n",
+ " \n",
+ " pbar.update(1)\n",
+ " pbar.set_postfix_str(f\"Appended {file['name']}\")\n",
+ "\n",
+ "# Print the total number of rows in the table after appending all files\n",
+ "print(f\"Total rows in the table: {len(table.scan().to_arrow())}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}