In [162]:
# Export PYICEBERG_MAX_WORKERS=300 into the kernel's environment; this runs
# before pyiceberg is imported further down.
# NOTE(review): presumably this sizes PyIceberg's worker pool - confirm that
# 300 is appropriate for the host running this notebook.
%env PYICEBERG_MAX_WORKERS=300

env: PYICEBERG_MAX_WORKERS=300


In [163]:
# Prerequisite (run once per environment): %pip install "pyiceberg[s3fs]"

In [164]:
from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError

In [165]:
# Open a REST catalog client called "public" that talks to the catalog
# service at http://localhost:8181.
catalog = RestCatalog("public", uri="http://localhost:8181")

In [166]:
# Ensure the "public" namespace exists. Creating it a second time raises
# NamespaceAlreadyExistsError, which would break a plain re-run of the
# notebook, so an already-existing namespace is treated as success.
try:
    catalog.create_namespace("public")
except NamespaceAlreadyExistsError:
    # Deliberate: the namespace being present is exactly the state we want.
    pass

In [167]:
from pyiceberg.schema import Schema
from pyiceberg.types import (
 NestedField,
 LongType,
 TimestampType,
 DoubleType,
 StringType,
)

# Column layout for the table schema, as (column name, Iceberg type) pairs.
# Field ids are assigned 1..19 in listing order and every column is optional
# (required=False).
_SCHEMA_COLUMNS = [
    ("VendorID", LongType),
    ("tpep_pickup_datetime", TimestampType),
    ("tpep_dropoff_datetime", TimestampType),
    ("passenger_count", DoubleType),
    ("trip_distance", DoubleType),
    ("RatecodeID", DoubleType),
    ("store_and_fwd_flag", StringType),
    ("PULocationID", LongType),
    ("DOLocationID", LongType),
    ("payment_type", LongType),
    ("fare_amount", DoubleType),
    ("extra", DoubleType),
    ("mta_tax", DoubleType),
    ("tip_amount", DoubleType),
    ("tolls_amount", DoubleType),
    ("improvement_surcharge", DoubleType),
    ("total_amount", DoubleType),
    ("congestion_surcharge", DoubleType),
    ("airport_fee", DoubleType),
]

schema = Schema(
    *(
        NestedField(
            field_id=position,
            name=column_name,
            field_type=iceberg_type(),
            required=False,
        )
        for position, (column_name, iceberg_type) in enumerate(_SCHEMA_COLUMNS, start=1)
    )
)


In [168]:
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.transforms import DayTransform, MonthTransform, YearTransform, BucketTransform

# Single partition field: the month transform applied to schema field id 2
# (tpep_pickup_datetime), exposed as "tpep_pickup_datetime_month".
pickup_month_field = PartitionField(
    source_id=2,
    field_id=1001,
    transform=MonthTransform(),
    name="tpep_pickup_datetime_month",
)
partition_spec = PartitionSpec(pickup_month_field)

In [169]:
from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.transforms import IdentityTransform

# Sort on schema field id 4 (passenger_count), using the identity transform
# so values are ordered as stored.
passenger_count_sort = SortField(source_id=4, transform=IdentityTransform())
sort_order = SortOrder(passenger_count_sort)

In [170]:
# Create the partitioned, sorted Iceberg table, or reuse it when it already
# exists so this cell survives a re-run.
# NOTE: when the table already exists it is loaded as-is; its schema,
# partitioning and properties are NOT reconciled with the values below.
try:
    table = catalog.create_table(
        identifier="public.nyc_taxi",
        schema=schema,
        partition_spec=partition_spec,
        sort_order=sort_order,
        properties={
            "write.format.default": "parquet",
            "write.parquet.compression-codec": "zstd",
            # 536870912 bytes = 512 MiB target data-file size.
            "write.target-file-size-bytes": "536870912",
            # NOTE(review): confirm the unit this setting expects; 10000 is
            # very large if it is interpreted as seconds.
            "s3.connect-timeout": "10000",
        },
    )
except TableAlreadyExistsError:
    table = catalog.load_table("public.nyc_taxi")

In [171]:
import os
import requests
import io
import pyarrow.parquet as pq
from tqdm import tqdm

# GitHub repository that hosts the sample Parquet files
repo_owner = "buster-so"
repo_name = "sample-data"
folder_path = "nyc_taxi"

# GitHub API endpoint that lists the contents of the folder
api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{folder_path}"

# Fetch the folder listing. A timeout is set so an unresponsive server makes
# the cell fail instead of hanging the kernel indefinitely.
response = requests.get(api_url, timeout=60)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch repository contents: {response.status_code}")

# Keep only the Parquet entries; each item carries 'name' and 'download_url'.
files = [item for item in response.json() if item['name'].endswith('.parquet')]

# Download every listed Parquet file and append it to the Iceberg table, one
# file at a time, with a progress bar.
# NOTE: each run appends the data again; re-running this cell against a table
# that is already loaded duplicates rows.
failed_downloads = []

with tqdm(total=len(files), desc="Appending files") as pbar:
    for file in files:
        # Download the file content. Network errors and timeouts are handled
        # like a non-200 response: report, remember the file, and move on.
        file_url = file['download_url']
        try:
            file_response = requests.get(file_url, timeout=300)
        except requests.RequestException as exc:
            print(f"Failed to download {file['name']}: {exc}")
            failed_downloads.append(file['name'])
            continue
        if file_response.status_code != 200:
            print(f"Failed to download {file['name']}: {file_response.status_code}")
            failed_downloads.append(file['name'])
            continue

        # Read the Parquet file from the in-memory response body
        file_content = io.BytesIO(file_response.content)
        df = pq.read_table(file_content)

        # Append to the Iceberg table
        table.append(df)

        pbar.update(1)
        pbar.set_postfix_str(f"Appended {file['name']}")

# Surface skipped files at the end so a partial load is not mistaken for a
# complete one.
if failed_downloads:
    print(f"Skipped {len(failed_downloads)} file(s): {', '.join(failed_downloads)}")

# Print the total number of rows in the table after appending all files.
# Only one column is read: the row count is the same, but the scan does not
# have to hold every column of every row in memory just to be counted.
total_rows = table.scan(selected_fields=("VendorID",)).to_arrow().num_rows
print(f"Total rows in the table: {total_rows}")

Appending files: 100%|██████████| 26/26 [10:19<00:00, 23.83s/it, Appended yellow_tripdata_2021-09.parquet]


Total rows in the table: 73531304
