|
124 | 124 | visit, |
125 | 125 | visit_with_partner, |
126 | 126 | ) |
127 | | -from pyiceberg.table import WriteTask |
| 127 | +from pyiceberg.table import PropertyUtil, TableProperties, WriteTask |
128 | 128 | from pyiceberg.table.name_mapping import NameMapping |
129 | 129 | from pyiceberg.transforms import TruncateTransform |
130 | 130 | from pyiceberg.typedef import EMPTY_DICT, Properties, Record |
@@ -1389,19 +1389,12 @@ class MetricModeTypes(Enum): |
1389 | 1389 | FULL = "full" |
1390 | 1390 |
|
1391 | 1391 |
|
1392 | | -DEFAULT_METRICS_MODE_KEY = "write.metadata.metrics.default" |
1393 | | -COLUMN_METRICS_MODE_KEY_PREFIX = "write.metadata.metrics.column" |
1394 | | - |
1395 | | - |
1396 | 1392 | @dataclass(frozen=True) |
1397 | 1393 | class MetricsMode(Singleton): |
1398 | 1394 | type: MetricModeTypes |
1399 | 1395 | length: Optional[int] = None |
1400 | 1396 |
|
1401 | 1397 |
|
1402 | | -_DEFAULT_METRICS_MODE = MetricsMode(MetricModeTypes.TRUNCATE, DEFAULT_TRUNCATION_LENGTH) |
1403 | | - |
1404 | | - |
1405 | 1398 | def match_metrics_mode(mode: str) -> MetricsMode: |
1406 | 1399 | sanitized_mode = mode.strip().lower() |
1407 | 1400 | if sanitized_mode.startswith("truncate"): |
@@ -1435,12 +1428,14 @@ class PyArrowStatisticsCollector(PreOrderSchemaVisitor[List[StatisticsCollector] |
1435 | 1428 | _field_id: int = 0 |
1436 | 1429 | _schema: Schema |
1437 | 1430 | _properties: Dict[str, str] |
1438 | | - _default_mode: Optional[str] |
| 1431 | + _default_mode: str |
1439 | 1432 |
|
1440 | 1433 | def __init__(self, schema: Schema, properties: Dict[str, str]): |
1441 | 1434 | self._schema = schema |
1442 | 1435 | self._properties = properties |
1443 | | - self._default_mode = self._properties.get(DEFAULT_METRICS_MODE_KEY) |
| 1436 | + self._default_mode = self._properties.get( |
| 1437 | + TableProperties.DEFAULT_WRITE_METRICS_MODE, TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT |
| 1438 | + ) |
1444 | 1439 |
|
1445 | 1440 | def schema(self, schema: Schema, struct_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]: |
1446 | 1441 | return struct_result() |
@@ -1475,12 +1470,9 @@ def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]: |
1475 | 1470 | if column_name is None: |
1476 | 1471 | return [] |
1477 | 1472 |
|
1478 | | - metrics_mode = _DEFAULT_METRICS_MODE |
1479 | | - |
1480 | | - if self._default_mode: |
1481 | | - metrics_mode = match_metrics_mode(self._default_mode) |
| 1473 | + metrics_mode = match_metrics_mode(self._default_mode) |
1482 | 1474 |
|
1483 | | - col_mode = self._properties.get(f"{COLUMN_METRICS_MODE_KEY_PREFIX}.{column_name}") |
| 1475 | + col_mode = self._properties.get(f"{TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX}.{column_name}") |
1484 | 1476 | if col_mode: |
1485 | 1477 | metrics_mode = match_metrics_mode(col_mode) |
1486 | 1478 |
|
@@ -1767,33 +1759,40 @@ def write_file(table: Table, tasks: Iterator[WriteTask]) -> Iterator[DataFile]: |
1767 | 1759 | return iter([data_file]) |
1768 | 1760 |
|
1769 | 1761 |
|
1770 | | -def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]: |
1771 | | - def _get_int(key: str, default: Optional[int] = None) -> Optional[int]: |
1772 | | - if value := table_properties.get(key): |
1773 | | - try: |
1774 | | - return int(value) |
1775 | | - except ValueError as e: |
1776 | | - raise ValueError(f"Could not parse table property {key} to an integer: {value}") from e |
1777 | | - else: |
1778 | | - return default |
| 1762 | +ICEBERG_UNCOMPRESSED_CODEC = "uncompressed" |
| 1763 | +PYARROW_UNCOMPRESSED_CODEC = "none" |
1779 | 1764 |
|
| 1765 | + |
| 1766 | +def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]: |
1780 | 1767 | for key_pattern in [ |
1781 | | - "write.parquet.row-group-size-bytes", |
1782 | | - "write.parquet.page-row-limit", |
1783 | | - "write.parquet.bloom-filter-max-bytes", |
1784 | | - "write.parquet.bloom-filter-enabled.column.*", |
| 1768 | + TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, |
| 1769 | + TableProperties.PARQUET_PAGE_ROW_LIMIT, |
| 1770 | + TableProperties.PARQUET_BLOOM_FILTER_MAX_BYTES, |
| 1771 | + f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX}.*", |
1785 | 1772 | ]: |
1786 | 1773 | if unsupported_keys := fnmatch.filter(table_properties, key_pattern): |
1787 | 1774 | raise NotImplementedError(f"Parquet writer option(s) {unsupported_keys} not implemented") |
1788 | 1775 |
|
1789 | | - compression_codec = table_properties.get("write.parquet.compression-codec", "zstd") |
1790 | | - compression_level = _get_int("write.parquet.compression-level") |
1791 | | - if compression_codec == "uncompressed": |
1792 | | - compression_codec = "none" |
| 1776 | + compression_codec = table_properties.get(TableProperties.PARQUET_COMPRESSION, TableProperties.PARQUET_COMPRESSION_DEFAULT) |
| 1777 | + compression_level = PropertyUtil.property_as_int( |
| 1778 | + properties=table_properties, |
| 1779 | + property_name=TableProperties.PARQUET_COMPRESSION_LEVEL, |
| 1780 | + default=TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT, |
| 1781 | + ) |
| 1782 | + if compression_codec == ICEBERG_UNCOMPRESSED_CODEC: |
| 1783 | + compression_codec = PYARROW_UNCOMPRESSED_CODEC |
1793 | 1784 |
|
1794 | 1785 | return { |
1795 | 1786 | "compression": compression_codec, |
1796 | 1787 | "compression_level": compression_level, |
1797 | | - "data_page_size": _get_int("write.parquet.page-size-bytes"), |
1798 | | - "dictionary_pagesize_limit": _get_int("write.parquet.dict-size-bytes", default=2 * 1024 * 1024), |
| 1788 | + "data_page_size": PropertyUtil.property_as_int( |
| 1789 | + properties=table_properties, |
| 1790 | + property_name=TableProperties.PARQUET_PAGE_SIZE_BYTES, |
| 1791 | + default=TableProperties.PARQUET_PAGE_SIZE_BYTES_DEFAULT, |
| 1792 | + ), |
| 1793 | + "dictionary_pagesize_limit": PropertyUtil.property_as_int( |
| 1794 | + properties=table_properties, |
| 1795 | + property_name=TableProperties.PARQUET_DICT_SIZE_BYTES, |
| 1796 | + default=TableProperties.PARQUET_DICT_SIZE_BYTES_DEFAULT, |
| 1797 | + ), |
1799 | 1798 | } |
0 commit comments