{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create the `public.nyc_taxi` Iceberg table with PyIceberg\n",
    "\n",
    "Connects to an Iceberg REST catalog at `http://localhost:8181`, ensures the `public` namespace exists, and creates the `public.nyc_taxi` table, partitioned by pickup month and sorted by `passenger_count`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: PYICEBERG_MAX_WORKERS=300\n"
     ]
    }
   ],
   "source": [
    "%env PYICEBERG_MAX_WORKERS=300"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment to install PyIceberg into the kernel's environment.\n",
    "#%pip install \"pyiceberg[s3fs]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports used by the setup cells below live here so a fresh kernel only needs this cell.\n",
    "from pyiceberg.catalog.rest import RestCatalog\n",
    "from pyiceberg.exceptions import NamespaceAlreadyExistsError\n",
    "from pyiceberg.partitioning import PartitionField, PartitionSpec\n",
    "from pyiceberg.schema import Schema\n",
    "from pyiceberg.table.sorting import SortField, SortOrder\n",
    "# NOTE(review): BucketTransform, DayTransform and YearTransform are not used by the cells\n",
    "# visible here; kept because later cells may rely on them -- confirm before removing.\n",
    "from pyiceberg.transforms import (\n",
    "    BucketTransform,\n",
    "    DayTransform,\n",
    "    IdentityTransform,\n",
    "    MonthTransform,\n",
    "    YearTransform,\n",
    ")\n",
    "from pyiceberg.types import (\n",
    "    DoubleType,\n",
    "    LongType,\n",
    "    NestedField,\n",
    "    StringType,\n",
    "    TimestampType,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Handle on the REST catalog served at localhost:8181.\n",
    "catalog = RestCatalog(\"public\", uri=\"http://localhost:8181\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating a namespace that already exists raises; tolerate that so the\n",
    "# notebook can be re-run against a catalog that kept its state.\n",
    "try:\n",
    "    catalog.create_namespace(\"public\")\n",
    "except NamespaceAlreadyExistsError:\n",
    "    print(\"Namespace 'public' already exists; reusing it.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table schema: 19 optional columns (required=False everywhere).\n",
    "# The field ids are referenced below by the partition spec (source_id=2)\n",
    "# and the sort order (source_id=4), so they must stay in sync with those cells.\n",
    "schema = Schema(\n",
    "    NestedField(field_id=1, name=\"VendorID\", field_type=LongType(), required=False),\n",
    "    NestedField(field_id=2, name=\"tpep_pickup_datetime\", field_type=TimestampType(), required=False),\n",
    "    NestedField(field_id=3, name=\"tpep_dropoff_datetime\", field_type=TimestampType(), required=False),\n",
    "    NestedField(field_id=4, name=\"passenger_count\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=5, name=\"trip_distance\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=6, name=\"RatecodeID\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=7, name=\"store_and_fwd_flag\", field_type=StringType(), required=False),\n",
    "    NestedField(field_id=8, name=\"PULocationID\", field_type=LongType(), required=False),\n",
    "    NestedField(field_id=9, name=\"DOLocationID\", field_type=LongType(), required=False),\n",
    "    NestedField(field_id=10, name=\"payment_type\", field_type=LongType(), required=False),\n",
    "    NestedField(field_id=11, name=\"fare_amount\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=12, name=\"extra\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=13, name=\"mta_tax\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=14, name=\"tip_amount\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=15, name=\"tolls_amount\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=16, name=\"improvement_surcharge\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=17, name=\"total_amount\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=18, name=\"congestion_surcharge\", field_type=DoubleType(), required=False),\n",
    "    NestedField(field_id=19, name=\"airport_fee\", field_type=DoubleType(), required=False),\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Partition by the month of schema field 2 (tpep_pickup_datetime).\n",
    "partition_spec = PartitionSpec(\n",
    "    PartitionField(source_id=2, field_id=1001, transform=MonthTransform(), name=\"tpep_pickup_datetime_month\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort on the untransformed value of schema field 4 (passenger_count).\n",
    "sort_order = SortOrder(\n",
    "    SortField(source_id=4, transform=IdentityTransform())\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): create_table is intentionally left unguarded. A later cell appears to\n",
    "# append files to this table (see its progress output); silently reusing an existing\n",
    "# table on re-run could duplicate data -- confirm before making this idempotent.\n",
    "table = catalog.create_table(\n",
    "    identifier=\"public.nyc_taxi\",\n",
    "    schema=schema,\n",
    "    partition_spec=partition_spec,\n",
    "    sort_order=sort_order,\n",
    "    properties={\n",
    "        \"write.format.default\": \"parquet\",\n",
    "        \"write.parquet.compression-codec\": \"zstd\",\n",
    "        \"write.target-file-size-bytes\": \"536870912\",  # 512 * 1024 * 1024\n",
    "        \"s3.connect-timeout\": \"10000\"\n",
    "    }\n",
    ")"
   ]
  },
  { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Appending files: 0%| | 0/26 [00:00