From 34337f579289b5b2032a76153d6cdd8a9a6512ac Mon Sep 17 00:00:00 2001 From: HTHou Date: Thu, 9 Apr 2026 11:02:15 +0800 Subject: [PATCH] Format Python code with Spotless and Black --- python/examples/example.py | 17 +- python/pom.xml | 30 ++ python/requirements.txt | 2 +- python/setup.py | 26 +- .../tests/bench_batch_arrow_vs_dataframe.py | 27 +- .../tests/bench_write_arrow_vs_dataframe.py | 135 +++++---- python/tests/test_basic.py | 27 +- python/tests/test_batch_arrow.py | 133 ++++++--- python/tests/test_dataframe.py | 261 +++++++++------- python/tests/test_load_tsfile_from_iotdb.py | 46 +-- python/tests/test_query_by_row.py | 34 ++- python/tests/test_reader_metadata.py | 15 +- python/tests/test_tag_filter.py | 35 ++- python/tests/test_tag_filter_query.py | 130 +++++--- python/tests/test_to_tsfile.py | 211 +++++++------ python/tests/test_tsfile_dataset.py | 50 +++- python/tests/test_write.py | 43 +-- python/tests/test_write_and_read.py | 219 +++++++++----- python/tests/test_write_arrow.py | 281 +++++++++++------- python/tsfile/__init__.py | 15 +- python/tsfile/constants.py | 38 +-- python/tsfile/dataset/dataframe.py | 124 ++++++-- python/tsfile/dataset/formatting.py | 13 +- python/tsfile/dataset/merge.py | 27 +- python/tsfile/dataset/metadata.py | 46 ++- python/tsfile/dataset/reader.py | 85 ++++-- python/tsfile/dataset/timeseries.py | 26 +- python/tsfile/exceptions.py | 5 +- python/tsfile/field.py | 69 +++-- python/tsfile/row_record.py | 1 + python/tsfile/schema.py | 42 ++- python/tsfile/tablet.py | 53 +++- python/tsfile/tag_filter.py | 16 +- python/tsfile/tsfile_table_writer.py | 56 ++-- python/tsfile/utils.py | 199 +++++++------ 35 files changed, 1664 insertions(+), 873 deletions(-) diff --git a/python/examples/example.py b/python/examples/example.py index cd0e61e5b..13555014d 100644 --- a/python/examples/example.py +++ b/python/examples/example.py @@ -19,7 +19,14 @@ from tsfile import ColumnSchema, TableSchema from tsfile import Tablet -from tsfile import TsFileTableWriter, TsFileReader, TSDataType, TSEncoding, Compressor, ColumnCategory +from tsfile import ( + TsFileTableWriter, + TsFileReader, + TSDataType, + TSEncoding, + Compressor, + ColumnCategory, +) ## Write table_data_dir = os.path.join(os.path.dirname(__file__), "table_data.tsfile") @@ -36,9 +43,10 @@ with TsFileTableWriter(table_data_dir, table_schema) as writer: tablet_row_num = 100 tablet = Tablet( - ["id", "id2", "value"], - [TSDataType.STRING, TSDataType.STRING, TSDataType.FLOAT], - tablet_row_num) + ["id", "id2", "value"], + [TSDataType.STRING, TSDataType.STRING, TSDataType.FLOAT], + tablet_row_num, + ) for i in range(tablet_row_num): tablet.add_timestamp(i, i * 10) @@ -57,4 +65,3 @@ print(result.get_value_by_name("id2")) print(result.get_value_by_name("value")) print(result.read_data_frame()) - diff --git a/python/pom.xml b/python/pom.xml index 7a39fc7aa..a728a5f04 100644 --- a/python/pom.xml +++ b/python/pom.xml @@ -36,6 +36,36 @@ ${project.basedir} + + com.diffplug.spotless + spotless-maven-plugin + ${spotless.version} + + + + examples/**/*.py + tests/**/*.py + tsfile/**/*.py + setup.py + + + 24.10.0 + ${project.basedir}/${python.venv.bin}black + + + UNIX + ${spotless.skip} + + + + spotless-check + + check + + validate + + + org.codehaus.mojo exec-maven-plugin diff --git a/python/requirements.txt b/python/requirements.txt index 9ee650389..e4cf71a2a 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -18,9 +18,9 @@ # cython==3.0.10 +black==24.10.0 numpy>=2.0.0,<3 pandas==2.2.2 setuptools==78.1.1 wheel==0.46.2 pyarrow>=8.0.0 - diff --git a/python/setup.py b/python/setup.py index 16fc7aa84..63d99de03 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,9 @@ shutil.rmtree(PKG / "include") shutil.copytree(CPP_INC, PKG / "include") if sys.platform.startswith("linux"): - candidates = sorted(CPP_LIB.glob("libtsfile.so*"), key=lambda p: len(p.name), reverse=True) + candidates = sorted( + CPP_LIB.glob("libtsfile.so*"), key=lambda p: len(p.name), reverse=True + ) if not candidates: raise FileNotFoundError("missing libtsfile.so* in build output") src = candidates[0] @@ -51,7 +53,9 @@ shutil.copy2(src, link_name) elif sys.platform == "darwin": - candidates = sorted(CPP_LIB.glob("libtsfile.*.dylib")) or list(CPP_LIB.glob("libtsfile.dylib")) + candidates = sorted(CPP_LIB.glob("libtsfile.*.dylib")) or list( + CPP_LIB.glob("libtsfile.dylib") + ) if not candidates: raise FileNotFoundError("missing libtsfile*.dylib in build output") src = candidates[0] @@ -61,8 +65,12 @@ shutil.copy2(src, link_name) elif sys.platform == "win32": for base_name in ("libtsfile",): - dll_candidates = sorted(CPP_LIB.glob(f"{base_name}*.dll"), key=lambda p: len(p.name), reverse=True) - dll_a_candidates = sorted(CPP_LIB.glob(f"{base_name}*.dll.a"), key=lambda p: len(p.name), reverse=True) + dll_candidates = sorted( + CPP_LIB.glob(f"{base_name}*.dll"), key=lambda p: len(p.name), reverse=True + ) + dll_a_candidates = sorted( + CPP_LIB.glob(f"{base_name}*.dll.a"), key=lambda p: len(p.name), reverse=True + ) if not dll_candidates: raise FileNotFoundError(f"missing {base_name}*.dll in build output") @@ -119,8 +127,14 @@ def finalize_options(self): extra_link_args += ["-Wl,-rpath,@loader_path", "-stdlib=libc++"] elif sys.platform == "win32": libraries = ["tsfile"] - extra_compile_args += ["-O2", "-std=c++11", "-DSIZEOF_VOID_P=8", "-D__USE_MINGW_ANSI_STDIO=1", "-DMS_WIN64", - "-D_WIN64"] + extra_compile_args += [ + "-O2", + "-std=c++11", + "-DSIZEOF_VOID_P=8", + "-D__USE_MINGW_ANSI_STDIO=1", + "-DMS_WIN64", + "-D_WIN64", + ] extra_link_args += [] else: raise RuntimeError(f"Unsupported platform: {sys.platform}") diff --git a/python/tests/bench_batch_arrow_vs_dataframe.py b/python/tests/bench_batch_arrow_vs_dataframe.py index 0e34347f2..1c690cbfe 100644 --- a/python/tests/bench_batch_arrow_vs_dataframe.py +++ b/python/tests/bench_batch_arrow_vs_dataframe.py @@ -61,12 +61,15 @@ def _ensure_bench_tsfile(file_path: str, row_count: int) -> None: remove(file_path) # Build data with pandas/numpy (vectorized, much faster than row-by-row Tablet) import numpy as np - df = pd.DataFrame({ - "time": np.arange(row_count, dtype=np.int64), - "device": pd.Series([f"device" for i in range(row_count)]), - "value1": np.arange(0, row_count * 10, 10, dtype=np.int64), - "value2": np.arange(row_count, dtype=np.float64) * 1.5, - }) + + df = pd.DataFrame( + { + "time": np.arange(row_count, dtype=np.int64), + "device": pd.Series([f"device" for i in range(row_count)]), + "value1": np.arange(0, row_count * 10, 10, dtype=np.int64), + "value2": np.arange(row_count, dtype=np.float64) * 1.5, + } + ) table = TableSchema( TABLE_NAME, @@ -135,7 +138,9 @@ def _run_timed(name: str, func, *args, rounds: int = DEFAULT_TIMED_ROUNDS): avg = sum(times) / len(times) total_rows = n rows_per_sec = total_rows / avg if avg > 0 else 0 - print(f" {name}: {avg:.3f}s avg ({min(times):.3f}s min) rows={total_rows} {rows_per_sec:.0f} rows/s") + print( + f" {name}: {avg:.3f}s avg ({min(times):.3f}s min) rows={total_rows} {rows_per_sec:.0f} rows/s" + ) return avg, total_rows @@ -148,7 +153,9 @@ def run_benchmark( _ensure_bench_tsfile(file_path, row_count) end_time = row_count + 1 - print(f"Benchmark: {row_count} rows, batch_size={batch_size}, timed_rounds={timed_rounds}") + print( + f"Benchmark: {row_count} rows, batch_size={batch_size}, timed_rounds={timed_rounds}" + ) df_avg, df_rows = _run_timed( "query_table + read_data_frame", @@ -170,7 +177,9 @@ def run_benchmark( print() if df_avg > 0: speedup = arrow_avg / df_avg - print(f" Arrow vs DataFrame time ratio: {speedup:.2f}x ({'Arrow faster' if speedup < 1 else 'DataFrame faster'})") + print( + f" Arrow vs DataFrame time ratio: {speedup:.2f}x ({'Arrow faster' if speedup < 1 else 'DataFrame faster'})" + ) assert df_rows == row_count, f"DataFrame path row count {df_rows} != {row_count}" assert arrow_rows == row_count, f"Arrow path row count {arrow_rows} != {row_count}" diff --git a/python/tests/bench_write_arrow_vs_dataframe.py b/python/tests/bench_write_arrow_vs_dataframe.py index c2f9bedcd..ffc59a42c 100644 --- a/python/tests/bench_write_arrow_vs_dataframe.py +++ b/python/tests/bench_write_arrow_vs_dataframe.py @@ -48,32 +48,36 @@ # Config # --------------------------------------------------------------------------- -DEFAULT_ROW_COUNT = 100_000 -DEFAULT_BATCH_SIZE = 8_192 -DEFAULT_ROUNDS = 3 +DEFAULT_ROW_COUNT = 100_000 +DEFAULT_BATCH_SIZE = 8_192 +DEFAULT_ROUNDS = 3 TABLE_NAME = "bench_table" BENCH_FILE = "bench_write_arrow.tsfile" -SCHEMA = TableSchema(TABLE_NAME, [ - ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("v_i64", TSDataType.INT64, ColumnCategory.FIELD), - ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), - ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), -]) +SCHEMA = TableSchema( + TABLE_NAME, + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("v_i64", TSDataType.INT64, ColumnCategory.FIELD), + ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), + ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), + ], +) # --------------------------------------------------------------------------- # Data generation # --------------------------------------------------------------------------- + def _make_numpy_data(row_count: int): - ts = np.arange(row_count, dtype="int64") - v_i64 = np.arange(row_count, dtype="int64") - v_f64 = np.arange(row_count, dtype="float64") * 1.5 - v_bool = (np.arange(row_count) % 2 == 0) - v_str = [f"s{i}" for i in range(row_count)] + ts = np.arange(row_count, dtype="int64") + v_i64 = np.arange(row_count, dtype="int64") + v_f64 = np.arange(row_count, dtype="float64") * 1.5 + v_bool = np.arange(row_count) % 2 == 0 + v_str = [f"s{i}" for i in range(row_count)] device = ["device0"] * row_count return ts, device, v_i64, v_f64, v_bool, v_str @@ -83,14 +87,18 @@ def _make_arrow_batches(row_count: int, batch_size: int): batches = [] for start in range(0, row_count, batch_size): end = min(start + batch_size, row_count) - batches.append(pa.record_batch({ - "time": pa.array(ts[start:end], type=pa.timestamp("ns")), - "device": pa.array(device[start:end], type=pa.string()), - "v_i64": pa.array(v_i64[start:end], type=pa.int64()), - "v_f64": pa.array(v_f64[start:end], type=pa.float64()), - "v_bool": pa.array(v_bool[start:end], type=pa.bool_()), - "v_str": pa.array(v_str[start:end], type=pa.string()), - })) + batches.append( + pa.record_batch( + { + "time": pa.array(ts[start:end], type=pa.timestamp("ns")), + "device": pa.array(device[start:end], type=pa.string()), + "v_i64": pa.array(v_i64[start:end], type=pa.int64()), + "v_f64": pa.array(v_f64[start:end], type=pa.float64()), + "v_bool": pa.array(v_bool[start:end], type=pa.bool_()), + "v_str": pa.array(v_str[start:end], type=pa.string()), + } + ) + ) return batches @@ -99,14 +107,18 @@ def _make_dataframe_chunks(row_count: int, batch_size: int): chunks = [] for start in range(0, row_count, batch_size): end = min(start + batch_size, row_count) - chunks.append(pd.DataFrame({ - "time": pd.Series(ts[start:end], dtype="int64"), - "device": device[start:end], - "v_i64": pd.Series(v_i64[start:end], dtype="int64"), - "v_f64": pd.Series(v_f64[start:end], dtype="float64"), - "v_bool": pd.Series(v_bool[start:end], dtype="bool"), - "v_str": v_str[start:end], - })) + chunks.append( + pd.DataFrame( + { + "time": pd.Series(ts[start:end], dtype="int64"), + "device": device[start:end], + "v_i64": pd.Series(v_i64[start:end], dtype="int64"), + "v_f64": pd.Series(v_f64[start:end], dtype="float64"), + "v_bool": pd.Series(v_bool[start:end], dtype="bool"), + "v_str": v_str[start:end], + } + ) + ) return chunks @@ -114,33 +126,42 @@ def _make_dataframe_chunks(row_count: int, batch_size: int): # Benchmark runners # --------------------------------------------------------------------------- + def _write_arrow(file_path: str, batches): - schema = TableSchema(TABLE_NAME, [ - ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("v_i64", TSDataType.INT64, ColumnCategory.FIELD), - ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), - ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), - ]) + schema = TableSchema( + TABLE_NAME, + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("v_i64", TSDataType.INT64, ColumnCategory.FIELD), + ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), + ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), + ], + ) with TsFileTableWriter(file_path, schema) as w: for batch in batches: w.write_arrow_batch(batch) def _write_dataframe(file_path: str, chunks): - schema = TableSchema(TABLE_NAME, [ - ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("v_i64", TSDataType.INT64, ColumnCategory.FIELD), - ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), - ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), - ]) + schema = TableSchema( + TABLE_NAME, + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("v_i64", TSDataType.INT64, ColumnCategory.FIELD), + ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), + ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), + ], + ) with TsFileTableWriter(file_path, schema) as w: for chunk in chunks: w.write_dataframe(chunk) -def _run_timed(label: str, func, *args, rounds: int = DEFAULT_ROUNDS, row_count: int = 0): +def _run_timed( + label: str, func, *args, rounds: int = DEFAULT_ROUNDS, row_count: int = 0 +): times = [] for _ in range(rounds): if os.path.exists(BENCH_FILE): @@ -159,27 +180,34 @@ def _run_timed(label: str, func, *args, rounds: int = DEFAULT_ROUNDS, row_count: # Main benchmark # --------------------------------------------------------------------------- + def run_benchmark( row_count: int = DEFAULT_ROW_COUNT, batch_size: int = DEFAULT_BATCH_SIZE, rounds: int = DEFAULT_ROUNDS, ): print() - print(f"=== write benchmark: {row_count:,} rows, batch_size={batch_size}, rounds={rounds} ===") + print( + f"=== write benchmark: {row_count:,} rows, batch_size={batch_size}, rounds={rounds} ===" + ) # Pre-build data once (exclude data-preparation time from timing) arrow_batches = _make_arrow_batches(row_count, batch_size) - df_chunks = _make_dataframe_chunks(row_count, batch_size) + df_chunks = _make_dataframe_chunks(row_count, batch_size) df_avg = _run_timed( "write_dataframe", - _write_dataframe, df_chunks, - rounds=rounds, row_count=row_count, + _write_dataframe, + df_chunks, + rounds=rounds, + row_count=row_count, ) arrow_avg = _run_timed( "write_arrow_batch", - _write_arrow, arrow_batches, - rounds=rounds, row_count=row_count, + _write_arrow, + arrow_batches, + rounds=rounds, + row_count=row_count, ) print() @@ -201,6 +229,7 @@ def run_benchmark( # Pytest entry points # --------------------------------------------------------------------------- + def test_bench_write_arrow_small(): """Quick sanity check with small data (5 k rows).""" run_benchmark(row_count=5_000, batch_size=1_024, rounds=2) @@ -225,6 +254,6 @@ def test_bench_write_arrow_large(): # --------------------------------------------------------------------------- if __name__ == "__main__": - row_count = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_ROW_COUNT + row_count = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_ROW_COUNT batch_size = int(sys.argv[2]) if len(sys.argv) > 2 else DEFAULT_BATCH_SIZE run_benchmark(row_count=row_count, batch_size=batch_size) diff --git a/python/tests/test_basic.py b/python/tests/test_basic.py index 675ef837f..af90d3c61 100644 --- a/python/tests/test_basic.py +++ b/python/tests/test_basic.py @@ -25,7 +25,13 @@ def test_tablet(): column_names = ["temp1", "temp2", "value1", "value2", "string1"] - data_types = [TSDataType.INT32, TSDataType.INT64, TSDataType.FLOAT, TSDataType.DOUBLE, TSDataType.STRING] + data_types = [ + TSDataType.INT32, + TSDataType.INT64, + TSDataType.FLOAT, + TSDataType.DOUBLE, + TSDataType.STRING, + ] tablet = Tablet(column_names, data_types) tablet.set_table_name("test") @@ -62,13 +68,14 @@ def test_tablet(): assert 0.1 == tablet.get_value_list_by_name("value1")[0] assert np.iinfo(np.int32).max == tablet.get_value_list_by_name("temp1")[30] + def test_field(): - field_int32 = Field("int32",10, TSDataType.INT32) + field_int32 = Field("int32", 10, TSDataType.INT32) field_int64 = Field("int64", np.iinfo(np.int32).max + 1, TSDataType.INT64) - field_float = Field("float",10.0, TSDataType.FLOAT) - field_double = Field("double",10.0, TSDataType.DOUBLE) - field_bool = Field("bool",True, TSDataType.BOOLEAN) - field = Field("t",100) + field_float = Field("float", 10.0, TSDataType.FLOAT) + field_double = Field("double", 10.0, TSDataType.DOUBLE) + field_bool = Field("bool", True, TSDataType.BOOLEAN) + field = Field("t", 100) assert 100 == field.get_value() assert np.int64(10) == field_int32.get_long_value() @@ -90,6 +97,7 @@ def test_field(): with pytest.raises(OverflowError): field_int64.get_int_value() + def test_schema(): column1 = ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG) column2 = ColumnSchema("sensor", TSDataType.STRING, ColumnCategory.TAG) @@ -108,12 +116,7 @@ def test_schema(): tablet = TableSchema("test_table", []) with pytest.raises(ValueError): - column = ColumnSchema("test_column",None, ColumnCategory.TAG) + column = ColumnSchema("test_column", None, ColumnCategory.TAG) with pytest.raises(ValueError): tablet = TableSchema("test_table", [ColumnSchema("", TSDataType.DOUBLE)]) - - - - - diff --git a/python/tests/test_batch_arrow.py b/python/tests/test_batch_arrow.py index 9f1443896..70911efe0 100644 --- a/python/tests/test_batch_arrow.py +++ b/python/tests/test_batch_arrow.py @@ -38,7 +38,7 @@ def test_batch_read_arrow_basic(): ColumnSchema("value2", TSDataType.DOUBLE, ColumnCategory.FIELD), ], ) - + try: if os.path.exists(file_path): os.remove(file_path) @@ -60,7 +60,7 @@ def test_batch_read_arrow_basic(): import pyarrow as pa except ImportError: pytest.skip("pyarrow is not installed") - + reader = TsFileReader(file_path) result_set = reader.query_table( table_name="test_table", @@ -69,14 +69,14 @@ def test_batch_read_arrow_basic(): end_time=1000, batch_size=256, ) - + total_rows = 0 batch_count = 0 while True: table = result_set.read_arrow_batch() if table is None: break - + batch_count += 1 assert isinstance(table, pa.Table) assert len(table) > 0 @@ -87,13 +87,13 @@ def test_batch_read_arrow_basic(): assert "device" in column_names assert "value1" in column_names assert "value2" in column_names - + assert total_rows == 1000 assert batch_count > 0 - + result_set.close() reader.close() - + finally: if os.path.exists(file_path): os.remove(file_path) @@ -110,7 +110,7 @@ def test_batch_read_arrow_compare_with_dataframe(): ColumnSchema("value3", TSDataType.BOOLEAN, ColumnCategory.FIELD), ], ) - + try: if os.path.exists(file_path): os.remove(file_path) @@ -118,7 +118,12 @@ def test_batch_read_arrow_compare_with_dataframe(): with TsFileTableWriter(file_path, table) as writer: tablet = Tablet( ["device", "value1", "value2", "value3"], - [TSDataType.STRING, TSDataType.INT32, TSDataType.FLOAT, TSDataType.BOOLEAN], + [ + TSDataType.STRING, + TSDataType.INT32, + TSDataType.FLOAT, + TSDataType.BOOLEAN, + ], 500, ) for i in range(500): @@ -128,7 +133,7 @@ def test_batch_read_arrow_compare_with_dataframe(): tablet.add_value_by_name("value2", i, i * 1.1) tablet.add_value_by_name("value3", i, i % 2 == 0) writer.write_table(tablet) - + try: import pyarrow as pa except ImportError: @@ -142,7 +147,7 @@ def test_batch_read_arrow_compare_with_dataframe(): end_time=500, batch_size=100, ) - + arrow_tables = [] while True: table = result_set1.read_arrow_batch() @@ -155,7 +160,7 @@ def test_batch_read_arrow_compare_with_dataframe(): df_arrow = combined_arrow_table.to_pandas() else: df_arrow = pd.DataFrame() - + result_set1.close() reader1.close() reader2 = TsFileReader(file_path) @@ -165,7 +170,7 @@ def test_batch_read_arrow_compare_with_dataframe(): start_time=0, end_time=500, ) - + df_traditional = result_set2.read_data_frame(max_row_num=1000) result_set2.close() reader2.close() @@ -178,15 +183,34 @@ def test_batch_read_arrow_compare_with_dataframe(): assert col in df_traditional.columns df_arrow_sorted = df_arrow.sort_values("time").reset_index(drop=True) - df_traditional_sorted = df_traditional.sort_values("time").reset_index(drop=True) - + df_traditional_sorted = df_traditional.sort_values("time").reset_index( + drop=True + ) + for i in range(len(df_arrow_sorted)): - assert df_arrow_sorted.iloc[i]["time"] == df_traditional_sorted.iloc[i]["time"] - assert df_arrow_sorted.iloc[i]["device"] == df_traditional_sorted.iloc[i]["device"] - assert df_arrow_sorted.iloc[i]["value1"] == df_traditional_sorted.iloc[i]["value1"] - assert abs(df_arrow_sorted.iloc[i]["value2"] - df_traditional_sorted.iloc[i]["value2"]) < 1e-5 - assert df_arrow_sorted.iloc[i]["value3"] == df_traditional_sorted.iloc[i]["value3"] - + assert ( + df_arrow_sorted.iloc[i]["time"] == df_traditional_sorted.iloc[i]["time"] + ) + assert ( + df_arrow_sorted.iloc[i]["device"] + == df_traditional_sorted.iloc[i]["device"] + ) + assert ( + df_arrow_sorted.iloc[i]["value1"] + == df_traditional_sorted.iloc[i]["value1"] + ) + assert ( + abs( + df_arrow_sorted.iloc[i]["value2"] + - df_traditional_sorted.iloc[i]["value2"] + ) + < 1e-5 + ) + assert ( + df_arrow_sorted.iloc[i]["value3"] + == df_traditional_sorted.iloc[i]["value3"] + ) + finally: if os.path.exists(file_path): os.remove(file_path) @@ -201,7 +225,7 @@ def test_batch_read_arrow_empty_result(): ColumnSchema("value", TSDataType.INT64, ColumnCategory.FIELD), ], ) - + try: if os.path.exists(file_path): os.remove(file_path) @@ -217,7 +241,7 @@ def test_batch_read_arrow_empty_result(): tablet.add_value_by_name("device", i, f"device_{i}") tablet.add_value_by_name("value", i, i) writer.write_table(tablet) - + try: import pyarrow as pa except ImportError: @@ -234,10 +258,10 @@ def test_batch_read_arrow_empty_result(): table = result_set.read_arrow_batch() assert table is None - + result_set.close() reader.close() - + finally: if os.path.exists(file_path): os.remove(file_path) @@ -253,7 +277,7 @@ def test_batch_read_arrow_time_range(): ColumnSchema("value", TSDataType.INT64, ColumnCategory.FIELD), ], ) - + try: if os.path.exists(file_path): os.remove(file_path) @@ -269,7 +293,7 @@ def test_batch_read_arrow_time_range(): tablet.add_value_by_name("device", i, f"device_{i}") tablet.add_value_by_name("value", i, i) writer.write_table(tablet) - + try: import pyarrow as pa except ImportError: @@ -283,7 +307,7 @@ def test_batch_read_arrow_time_range(): end_time=199, batch_size=50, ) - + total_rows = 0 while True: table = result_set.read_arrow_batch() @@ -293,12 +317,12 @@ def test_batch_read_arrow_time_range(): df = table.to_pandas() assert df["time"].min() >= 100 assert df["time"].max() <= 199 - + assert total_rows == 100 - + result_set.close() reader.close() - + finally: if os.path.exists(file_path): os.remove(file_path) @@ -319,14 +343,23 @@ def test_batch_read_arrow_all_datatypes(): ColumnSchema("date_val", TSDataType.DATE, ColumnCategory.FIELD), ], ) - + try: if os.path.exists(file_path): os.remove(file_path) with TsFileTableWriter(file_path, table) as writer: tablet = Tablet( - ["device", "bool_val", "int32_val", "int64_val", "float_val", "double_val", "string_val", "date_val"], + [ + "device", + "bool_val", + "int32_val", + "int64_val", + "float_val", + "double_val", + "string_val", + "date_val", + ], [ TSDataType.STRING, TSDataType.BOOLEAN, @@ -350,7 +383,7 @@ def test_batch_read_arrow_all_datatypes(): tablet.add_value_by_name("string_val", i, f"string_{i}") tablet.add_value_by_name("date_val", i, date(2025, 1, (i % 28) + 1)) writer.write_table(tablet) - + try: import pyarrow as pa except ImportError: @@ -359,18 +392,27 @@ def test_batch_read_arrow_all_datatypes(): reader = TsFileReader(file_path) result_set = reader.query_table( table_name="test_table", - column_names=["device", "bool_val", "int32_val", "int64_val", "float_val", "double_val", "string_val", "date_val"], + column_names=[ + "device", + "bool_val", + "int32_val", + "int64_val", + "float_val", + "double_val", + "string_val", + "date_val", + ], start_time=0, end_time=200, batch_size=50, ) - + total_rows = 0 while True: table = result_set.read_arrow_batch() if table is None: break - + total_rows += len(table) df = table.to_pandas() @@ -383,12 +425,12 @@ def test_batch_read_arrow_all_datatypes(): assert "double_val" in df.columns assert "string_val" in df.columns assert "date_val" in df.columns - + assert total_rows == 200 - + result_set.close() reader.close() - + finally: if os.path.exists(file_path): os.remove(file_path) @@ -403,7 +445,7 @@ def test_batch_read_arrow_no_pyarrow(): ColumnSchema("value", TSDataType.INT64, ColumnCategory.FIELD), ], ) - + try: if os.path.exists(file_path): os.remove(file_path) @@ -419,7 +461,7 @@ def test_batch_read_arrow_no_pyarrow(): tablet.add_value_by_name("device", i, f"device_{i}") tablet.add_value_by_name("value", i, i) writer.write_table(tablet) - + reader = TsFileReader(file_path) result_set = reader.query_table( table_name="test_table", @@ -430,7 +472,7 @@ def test_batch_read_arrow_no_pyarrow(): ) result_set.close() reader.close() - + finally: if os.path.exists(file_path): os.remove(file_path) @@ -438,7 +480,4 @@ def test_batch_read_arrow_no_pyarrow(): if __name__ == "__main__": os.chdir(os.path.dirname(os.path.abspath(__file__))) - pytest.main([ - "test_batch_arrow.py", - "-s", "-v" - ]) + pytest.main(["test_batch_arrow.py", "-s", "-v"]) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e40ff32a0..e376add7a 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -26,7 +26,10 @@ from tsfile import TsFileTableWriter, ColumnCategory from tsfile import to_dataframe from tsfile.exceptions import ColumnNotExistError, TypeMismatchError -from tsfile.tsfile_table_writer import validate_dataframe_for_tsfile, infer_object_column_type +from tsfile.tsfile_table_writer import ( + validate_dataframe_for_tsfile, + infer_object_column_type, +) def convert_to_nullable_types(df): @@ -38,16 +41,16 @@ def convert_to_nullable_types(df): df = df.copy() for col in df.columns: dtype = df[col].dtype - if dtype == 'int64': - df[col] = df[col].astype('Int64') - elif dtype == 'int32': - df[col] = df[col].astype('Int32') - elif dtype == 'float64': - df[col] = df[col].astype('Float64') - elif dtype == 'float32': - df[col] = df[col].astype('Float32') - elif dtype == 'bool': - df[col] = df[col].astype('boolean') + if dtype == "int64": + df[col] = df[col].astype("Int64") + elif dtype == "int32": + df[col] = df[col].astype("Int32") + elif dtype == "float64": + df[col] = df[col].astype("Float64") + elif dtype == "float32": + df[col] = df[col].astype("Float32") + elif dtype == "bool": + df[col] = df[col].astype("boolean") return df @@ -60,27 +63,35 @@ def test_infer_object_column_type_bool(): def test_write_dataframe_basic(): - table = TableSchema("test_table", - [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("value2", TSDataType.INT64, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("value2", TSDataType.INT64, ColumnCategory.FIELD), + ], + ) tsfile_path = "test_write_dataframe_basic.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) with TsFileTableWriter(tsfile_path, table) as writer: - df = pd.DataFrame({ - 'time': [i for i in range(100)], - 'device': [f"device{i}" for i in range(100)], - 'value': [i * 1.5 for i in range(100)], - 'value2': [i * 10 for i in range(100)] - }) + df = pd.DataFrame( + { + "time": [i for i in range(100)], + "device": [f"device{i}" for i in range(100)], + "value": [i * 1.5 for i in range(100)], + "value2": [i * 10 for i in range(100)], + } + ) writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) - df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + df_sorted = convert_to_nullable_types( + df.sort_values("time").reset_index(drop=True) + ) assert df_read.shape == (100, 4) assert df_read[TIME_COLUMN].equals(df_sorted["time"]) assert df_read["device"].equals(df_sorted["device"]) @@ -92,26 +103,32 @@ def test_write_dataframe_basic(): def test_write_dataframe_with_index(): - table = TableSchema("test_table", - [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) tsfile_path = "test_write_dataframe_index.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) with TsFileTableWriter(tsfile_path, table) as writer: - df = pd.DataFrame({ - 'device': [f"device{i}" for i in range(50)], - 'value': [i * 2.5 for i in range(50)] - }) + df = pd.DataFrame( + { + "device": [f"device{i}" for i in range(50)], + "value": [i * 2.5 for i in range(50)], + } + ) df.index = [i * 10 for i in range(50)] # Set index as timestamps writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = df.sort_index() df_sorted = convert_to_nullable_types(df_sorted.reset_index(drop=True)) - time_series = pd.Series(df.sort_index().index.values, dtype='Int64') + time_series = pd.Series(df.sort_index().index.values, dtype="Int64") assert df_read.shape == (50, 3) assert df_read[TIME_COLUMN].equals(time_series) assert df_read["device"].equals(df_sorted["device"]) @@ -122,25 +139,33 @@ def test_write_dataframe_with_index(): def test_write_dataframe_case_insensitive(): - table = TableSchema("test_table", - [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) tsfile_path = "test_write_dataframe_case.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) with TsFileTableWriter(tsfile_path, table) as writer: - df = pd.DataFrame({ - 'Time': [i for i in range(30)], # Capital T - 'Device': [f"device{i}" for i in range(30)], # Capital D - 'VALUE': [i * 3.0 for i in range(30)] # All caps - }) + df = pd.DataFrame( + { + "Time": [i for i in range(30)], # Capital T + "Device": [f"device{i}" for i in range(30)], # Capital D + "VALUE": [i * 3.0 for i in range(30)], # All caps + } + ) writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) - df_sorted = convert_to_nullable_types(df.sort_values('Time').reset_index(drop=True)) + df_sorted = convert_to_nullable_types( + df.sort_values("Time").reset_index(drop=True) + ) assert df_read.shape == (30, 3) assert df_read[TIME_COLUMN].equals(df_sorted["Time"]) assert df_read["device"].equals(df_sorted["Device"]) @@ -151,21 +176,27 @@ def test_write_dataframe_case_insensitive(): def test_write_dataframe_column_not_in_schema(): - table = TableSchema("test_table", - [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) tsfile_path = "test_write_dataframe_extra_col.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) with TsFileTableWriter(tsfile_path, table) as writer: - df = pd.DataFrame({ - 'time': [i for i in range(10)], - 'device': [f"device{i}" for i in range(10)], - 'value': [i * 1.0 for i in range(10)], - 'extra_column': [i for i in range(10)] # Not in schema - }) + df = pd.DataFrame( + { + "time": [i for i in range(10)], + "device": [f"device{i}" for i in range(10)], + "value": [i * 1.0 for i in range(10)], + "extra_column": [i for i in range(10)], # Not in schema + } + ) with pytest.raises(ColumnNotExistError): writer.write_dataframe(df) finally: @@ -174,18 +205,18 @@ def test_write_dataframe_column_not_in_schema(): def test_write_dataframe_type_mismatch(): - table = TableSchema("test_table", - [ColumnSchema("value", TSDataType.STRING, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", [ColumnSchema("value", TSDataType.STRING, ColumnCategory.FIELD)] + ) tsfile_path = "test_write_dataframe_type_mismatch.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) with TsFileTableWriter(tsfile_path, table) as writer: - df = pd.DataFrame({ - 'time': [i for i in range(10)], - 'value': [i for i in range(10)] - }) + df = pd.DataFrame( + {"time": [i for i in range(10)], "value": [i for i in range(10)]} + ) with pytest.raises(TypeMismatchError) as exc_info: writer.write_dataframe(df) finally: @@ -194,41 +225,51 @@ def test_write_dataframe_type_mismatch(): def test_write_dataframe_all_datatypes(): - table = TableSchema("test_table", - [ColumnSchema("bool_col", TSDataType.BOOLEAN, ColumnCategory.FIELD), - ColumnSchema("int32_col", TSDataType.INT32, ColumnCategory.FIELD), - ColumnSchema("int64_col", TSDataType.INT64, ColumnCategory.FIELD), - ColumnSchema("float_col", TSDataType.FLOAT, ColumnCategory.FIELD), - ColumnSchema("double_col", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("string_col", TSDataType.STRING, ColumnCategory.FIELD), - ColumnSchema("blob_col", TSDataType.BLOB, ColumnCategory.FIELD), - ColumnSchema("text_col", TSDataType.TEXT, ColumnCategory.FIELD), - ColumnSchema("date_col", TSDataType.DATE, ColumnCategory.FIELD), - ColumnSchema("timestamp_col", TSDataType.TIMESTAMP, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", + [ + ColumnSchema("bool_col", TSDataType.BOOLEAN, ColumnCategory.FIELD), + ColumnSchema("int32_col", TSDataType.INT32, ColumnCategory.FIELD), + ColumnSchema("int64_col", TSDataType.INT64, ColumnCategory.FIELD), + ColumnSchema("float_col", TSDataType.FLOAT, ColumnCategory.FIELD), + ColumnSchema("double_col", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("string_col", TSDataType.STRING, ColumnCategory.FIELD), + ColumnSchema("blob_col", TSDataType.BLOB, ColumnCategory.FIELD), + ColumnSchema("text_col", TSDataType.TEXT, ColumnCategory.FIELD), + ColumnSchema("date_col", TSDataType.DATE, ColumnCategory.FIELD), + ColumnSchema("timestamp_col", TSDataType.TIMESTAMP, ColumnCategory.FIELD), + ], + ) tsfile_path = "test_write_dataframe_all_types.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) with TsFileTableWriter(tsfile_path, table) as writer: - df = pd.DataFrame({ - 'time': [i for i in range(50)], - 'bool_col': [i % 2 == 0 for i in range(50)], - 'int32_col': pd.Series([i for i in range(50)], dtype='int32'), - 'int64_col': [i * 10 for i in range(50)], - 'float_col': pd.Series([i * 1.5 for i in range(50)], dtype='float32'), - 'double_col': [i * 2.5 for i in range(50)], - 'string_col': [f"str{i}" for i in range(50)], - 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)], - 'text_col': [f"text{i}" for i in range(50)], - 'date_col': [date(2025, i % 11 + 1, i % 20 + 1) for i in range(50)], - 'timestamp_col': [i for i in range(50)] - }) + df = pd.DataFrame( + { + "time": [i for i in range(50)], + "bool_col": [i % 2 == 0 for i in range(50)], + "int32_col": pd.Series([i for i in range(50)], dtype="int32"), + "int64_col": [i * 10 for i in range(50)], + "float_col": pd.Series( + [i * 1.5 for i in range(50)], dtype="float32" + ), + "double_col": [i * 2.5 for i in range(50)], + "string_col": [f"str{i}" for i in range(50)], + "blob_col": [f"blob{i}".encode("utf-8") for i in range(50)], + "text_col": [f"text{i}" for i in range(50)], + "date_col": [date(2025, i % 11 + 1, i % 20 + 1) for i in range(50)], + "timestamp_col": [i for i in range(50)], + } + ) writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) - df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + df_sorted = convert_to_nullable_types( + df.sort_values("time").reset_index(drop=True) + ) assert df_read.shape == (50, 11) assert df_read["bool_col"].equals(df_sorted["bool_col"]) assert df_read["int32_col"].equals(df_sorted["int32_col"]) @@ -248,26 +289,34 @@ def test_write_dataframe_all_datatypes(): def test_write_dataframe_schema_time_column(): - table = TableSchema("test_table", - [ColumnSchema("time", TSDataType.TIMESTAMP, ColumnCategory.TIME), - ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", + [ + ColumnSchema("time", TSDataType.TIMESTAMP, ColumnCategory.TIME), + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) tsfile_path = "test_write_dataframe_schema_time.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) with TsFileTableWriter(tsfile_path, table) as writer: - df = pd.DataFrame({ - 'time': [i * 100 for i in range(50)], - 'device': [f"device{i}" for i in range(50)], - 'value': [i * 1.5 for i in range(50)] - }) + df = pd.DataFrame( + { + "time": [i * 100 for i in range(50)], + "device": [f"device{i}" for i in range(50)], + "value": [i * 1.5 for i in range(50)], + } + ) writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) - df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + df_sorted = convert_to_nullable_types( + df.sort_values("time").reset_index(drop=True) + ) assert df_read.shape == (50, 3) assert df_read[TIME_COLUMN].equals(df_sorted[TIME_COLUMN]) assert df_read["device"].equals(df_sorted["device"]) @@ -278,26 +327,32 @@ def test_write_dataframe_schema_time_column(): def test_write_dataframe_schema_time_and_dataframe_time(): - table = TableSchema("test_table", - [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) tsfile_path = "test_write_dataframe_schema_and_df_time.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) with TsFileTableWriter(tsfile_path, table) as writer: - df = pd.DataFrame({ - 'Time': [i for i in range(30)], - 'device': [f"dev{i}" for i in range(30)], - 'value': [float(i) for i in range(30)] - }) + df = pd.DataFrame( + { + "Time": [i for i in range(30)], + "device": [f"dev{i}" for i in range(30)], + "value": [float(i) for i in range(30)], + } + ) writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types( - df.sort_values('Time').rename(columns=str.lower).reset_index(drop=True) + df.sort_values("Time").rename(columns=str.lower).reset_index(drop=True) ) assert df_read.shape == (30, 3) assert df_read["time"].equals(df_sorted["time"]) @@ -309,18 +364,16 @@ def test_write_dataframe_schema_time_and_dataframe_time(): def test_write_dataframe_empty(): - table = TableSchema("test_table", - [ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", [ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)] + ) tsfile_path = "test_write_dataframe_empty.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) with TsFileTableWriter(tsfile_path, table) as writer: - df = pd.DataFrame({ - 'time': [], - 'value': [] - }) + df = pd.DataFrame({"time": [], "value": []}) with pytest.raises(ValueError): writer.write_dataframe(df) diff --git a/python/tests/test_load_tsfile_from_iotdb.py b/python/tests/test_load_tsfile_from_iotdb.py index 21347c9e6..0263c7125 100644 --- a/python/tests/test_load_tsfile_from_iotdb.py +++ b/python/tests/test_load_tsfile_from_iotdb.py @@ -26,8 +26,8 @@ def test_load_tsfile_from_iotdb(): test_path = os.path.dirname(os.path.abspath(__file__)) - dir_path = os.path.join(test_path, 'resources') - simple_tree_path = os.path.join(dir_path, 'simple_tree.tsfile') + dir_path = os.path.join(test_path, "resources") + simple_tree_path = os.path.join(dir_path, "simple_tree.tsfile") df = ts.to_dataframe(simple_tree_path) ## -------- @@ -41,14 +41,14 @@ def test_load_tsfile_from_iotdb(): ## --------- # - simple_tabl1_path = os.path.join(dir_path, 'simple_table_t1.tsfile') + simple_tabl1_path = os.path.join(dir_path, "simple_table_t1.tsfile") df = ts.to_dataframe(simple_tabl1_path) ## --------- assert len(df) == 60 assert df[TIME_COLUMN].isna().sum() == 0 assert df[TIME_COLUMN].sum() == ( - (1760106020000 + 1760106049000) * 30 // 2 + - (1760106080000 + 1760106109000) * 30 // 2 + (1760106020000 + 1760106049000) * 30 // 2 + + (1760106080000 + 1760106109000) * 30 // 2 ) assert df["s0"].isna().sum() == 0 df_s0 = df["s0"] @@ -59,16 +59,14 @@ def test_load_tsfile_from_iotdb(): assert df["s4"].nunique() == 60 assert df["s5"].isna().sum() == 0 - assert df["s5"].sum() == ( - (1010 + 1039) * 30 // 2 + - (1070 + 1099) * 30 // 2 - ) + assert df["s5"].sum() == ((1010 + 1039) * 30 // 2 + (1070 + 1099) * 30 // 2) assert df["s6"].isna().sum() == 8 assert df["s6"].sum(skipna=True) == ( - (20 + 49) * 30 // 2 - (26 + 33 + 39 + 46) - + - (80 + 109) * 30 // 2 - (86 + 93 + 99 + 106) + (20 + 49) * 30 // 2 + - (26 + 33 + 39 + 46) + + (80 + 109) * 30 // 2 + - (86 + 93 + 99 + 106) ) assert df["s7"].isna().sum() == 0 assert df["s8"].isna().sum() == 0 @@ -82,7 +80,7 @@ def test_load_tsfile_from_iotdb(): ## --------- - simple_tabl2_path = os.path.join(dir_path, 'simple_table_t2.tsfile') + simple_tabl2_path = os.path.join(dir_path, "simple_table_t2.tsfile") df = ts.to_dataframe(simple_tabl2_path) ## --------- assert len(df) == 40 @@ -117,7 +115,9 @@ def test_load_tsfile_from_iotdb(): assert df["s9"].isna().sum() == 5 ## --------- - table_with_time_column_path = os.path.join(dir_path, 'table_with_time_column.tsfile') + table_with_time_column_path = os.path.join( + dir_path, "table_with_time_column.tsfile" + ) df = ts.to_dataframe(table_with_time_column_path) assert list(df.columns)[0] == "id" @@ -127,21 +127,29 @@ def test_load_tsfile_from_iotdb(): assert (df["region_id"] == "loc").sum() == 25 df_id = df["id"] - df = ts.to_dataframe(table_with_time_column_path, table_name="table2", - column_names=["region_id", "temperature", "humidity"]) + df = ts.to_dataframe( + table_with_time_column_path, + table_name="table2", + column_names=["region_id", "temperature", "humidity"], + ) assert list(df.columns)[0] == "id" assert len(df) == 25 assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) assert (df["region_id"] == "loc").sum() == 25 - df = ts.to_dataframe(table_with_time_column_path, table_name="table2", - column_names=["id", "temperature", "humidity"]) + df = ts.to_dataframe( + table_with_time_column_path, + table_name="table2", + column_names=["id", "temperature", "humidity"], + ) assert list(df.columns)[0] == "time" assert df["id"].equals(df["time"]) assert len(df) == 25 assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) assert math.isclose(df["humidity"].sum(), 2.5, rel_tol=1e-9) - df = ts.to_dataframe(table_with_time_column_path, table_name="table2", column_names=["id"]) + df = ts.to_dataframe( + table_with_time_column_path, table_name="table2", column_names=["id"] + ) assert len(df.columns) == 2 assert df_id.equals(df["id"]) diff --git a/python/tests/test_query_by_row.py b/python/tests/test_query_by_row.py index e45cd1b20..29b71c6ad 100644 --- a/python/tests/test_query_by_row.py +++ b/python/tests/test_query_by_row.py @@ -19,8 +19,21 @@ import pytest -from tsfile import ColumnCategory, ColumnSchema, Field, RowRecord, TableSchema, TSDataType -from tsfile import TimeseriesSchema, TsFileReader, TsFileTableWriter, TsFileWriter, Tablet +from tsfile import ( + ColumnCategory, + ColumnSchema, + Field, + RowRecord, + TableSchema, + TSDataType, +) +from tsfile import ( + TimeseriesSchema, + TsFileReader, + TsFileTableWriter, + TsFileWriter, + Tablet, +) def test_query_tree_by_row_offset_limit(): @@ -36,7 +49,9 @@ def test_query_tree_by_row_offset_limit(): writer = TsFileWriter(file_path) for device_id in device_ids: for measurement in measurement_names: - writer.register_timeseries(device_id, TimeseriesSchema(measurement, TSDataType.INT64)) + writer.register_timeseries( + device_id, TimeseriesSchema(measurement, TSDataType.INT64) + ) for t in range(num_rows): for dev_idx, device_id in enumerate(device_ids): @@ -51,7 +66,9 @@ def test_query_tree_by_row_offset_limit(): reader = TsFileReader(file_path) offset = 3 limit = 5 - with reader.query_tree_by_row(device_ids, measurement_names, offset, limit) as result: + with reader.query_tree_by_row( + device_ids, measurement_names, offset, limit + ) as result: row = 0 while result.next(): ts = result.get_value_by_index(1) @@ -86,7 +103,9 @@ def test_query_table_by_row_offset_limit(): num_rows = 10 with TsFileTableWriter(file_path, schema) as writer: - tablet = Tablet(["device", "s1"], [TSDataType.STRING, TSDataType.INT64], num_rows) + tablet = Tablet( + ["device", "s1"], [TSDataType.STRING, TSDataType.INT64], num_rows + ) for t in range(num_rows): tablet.add_timestamp(t, t) tablet.add_value_by_name("device", t, f"device_{t}") @@ -96,7 +115,9 @@ def test_query_table_by_row_offset_limit(): reader = TsFileReader(file_path) offset = 3 limit = 5 - with reader.query_table_by_row(table_name, ["device", "s1"], offset, limit) as result: + with reader.query_table_by_row( + table_name, ["device", "s1"], offset, limit + ) as result: row = 0 while result.next(): ts = result.get_value_by_index(1) @@ -109,4 +130,3 @@ def test_query_table_by_row_offset_limit(): finally: if os.path.exists(file_path): os.remove(file_path) - diff --git a/python/tests/test_reader_metadata.py b/python/tests/test_reader_metadata.py index 558fcbb13..3a100c33f 100644 --- a/python/tests/test_reader_metadata.py +++ b/python/tests/test_reader_metadata.py @@ -39,10 +39,8 @@ def test_get_all_devices_segments(): device = "root.sg.py_details" writer = TsFileWriter(path) - writer.register_timeseries( - device, TimeseriesSchema("m", TSDataType.INT32)) - writer.write_row_record( - RowRecord(device, 1, [Field("m", 1, TSDataType.INT32)])) + writer.register_timeseries(device, TimeseriesSchema("m", TSDataType.INT32)) + writer.write_row_record(RowRecord(device, 1, [Field("m", 1, TSDataType.INT32)])) writer.close() reader = TsFileReader(path) @@ -75,8 +73,7 @@ def test_get_all_devices_and_timeseries_metadata_statistic(): device = "root.sg.py_meta" writer = TsFileWriter(path) - writer.register_timeseries( - device, TimeseriesSchema("m_int", TSDataType.INT32)) + writer.register_timeseries(device, TimeseriesSchema("m_int", TSDataType.INT32)) for row in range(3): v = (row + 1) * 10 writer.write_row_record( @@ -141,8 +138,7 @@ def test_get_timeseries_metadata_boolean_statistic(): device = "root.sg.py_bool" writer = TsFileWriter(path) - writer.register_timeseries( - device, TimeseriesSchema("m_b", TSDataType.BOOLEAN)) + writer.register_timeseries(device, TimeseriesSchema("m_b", TSDataType.BOOLEAN)) for row, b in enumerate([True, False, True]): writer.write_row_record( RowRecord( @@ -179,8 +175,7 @@ def test_get_timeseries_metadata_string_statistic(): device = "root.sg.py_str" writer = TsFileWriter(path) - writer.register_timeseries( - device, TimeseriesSchema("m_str", TSDataType.STRING)) + writer.register_timeseries(device, TimeseriesSchema("m_str", TSDataType.STRING)) for row, s in enumerate(["aa", "cc", "bb"]): writer.write_row_record( RowRecord( diff --git a/python/tests/test_tag_filter.py b/python/tests/test_tag_filter.py index 69f48af5e..141b477a9 100644 --- a/python/tests/test_tag_filter.py +++ b/python/tests/test_tag_filter.py @@ -21,10 +21,23 @@ import pytest from tsfile import ( - ColumnSchema, TableSchema, TSDataType, ColumnCategory, - TsFileTableWriter, TsFileReader, Tablet, - tag_eq, tag_neq, tag_lt, tag_lteq, tag_gt, tag_gteq, - tag_regexp, tag_not_regexp, tag_between, tag_not_between, + ColumnSchema, + TableSchema, + TSDataType, + ColumnCategory, + TsFileTableWriter, + TsFileReader, + Tablet, + tag_eq, + tag_neq, + tag_lt, + tag_lteq, + tag_gt, + tag_gteq, + tag_regexp, + tag_not_regexp, + tag_between, + tag_not_between, ) TSFILE_PATH = "test_tag_filter.tsfile" @@ -83,7 +96,9 @@ def create_tsfile(): def _query_values(reader, tag_filter): """Helper: query all columns with the given tag_filter, return list of (region, device, value) tuples.""" - result = reader.query_table(TABLE_NAME, ["region", "device", "value"], tag_filter=tag_filter) + result = reader.query_table( + TABLE_NAME, ["region", "device", "value"], tag_filter=tag_filter + ) rows = [] while result.next(): region = result.get_value_by_name("region") @@ -199,7 +214,9 @@ def test_tag_not(): def test_tag_complex_combination(): with TsFileReader(TSFILE_PATH) as reader: # (region == "north" AND device == "dev_b") OR region == "east" - f = (tag_eq("region", "north") & tag_eq("device", "dev_b")) | tag_eq("region", "east") + f = (tag_eq("region", "north") & tag_eq("device", "dev_b")) | tag_eq( + "region", "east" + ) rows = _query_values(reader, f) assert len(rows) == 10 # dev_b (5) + east (5) for r in rows: @@ -221,8 +238,10 @@ def test_tag_filter_with_time_range(): """Tag filter combined with time range.""" with TsFileReader(TSFILE_PATH) as reader: result = reader.query_table( - TABLE_NAME, ["region", "device", "value"], - start_time=0, end_time=7, + TABLE_NAME, + ["region", "device", "value"], + start_time=0, + end_time=7, tag_filter=tag_eq("region", "north"), ) rows = [] diff --git a/python/tests/test_tag_filter_query.py b/python/tests/test_tag_filter_query.py index 513fd0c44..1d2482b5d 100644 --- a/python/tests/test_tag_filter_query.py +++ b/python/tests/test_tag_filter_query.py @@ -22,9 +22,16 @@ import pytest from tsfile import ( - ColumnSchema, TableSchema, TSDataType, ColumnCategory, - TsFileTableWriter, TsFileReader, Tablet, - tag_eq, tag_gteq, TIME_COLUMN, + ColumnSchema, + TableSchema, + TSDataType, + ColumnCategory, + TsFileTableWriter, + TsFileReader, + Tablet, + tag_eq, + tag_gteq, + TIME_COLUMN, ) TSFILE_PATH = "test_tag_filter_query.tsfile" @@ -84,11 +91,13 @@ def create_tsfile(): def _scalar_rows(result): rows = [] while result.next(): - rows.append(( - result.get_value_by_name("region"), - result.get_value_by_name("device"), - result.get_value_by_name("value"), - )) + rows.append( + ( + result.get_value_by_name("region"), + result.get_value_by_name("device"), + result.get_value_by_name("value"), + ) + ) return rows @@ -107,11 +116,13 @@ def _arrow_rows(result): combined = pa.concat_tables(tables) rows = [] for i in range(combined.num_rows): - rows.append(( - combined.column("region")[i].as_py(), - combined.column("device")[i].as_py(), - combined.column("value")[i].as_py(), - )) + rows.append( + ( + combined.column("region")[i].as_py(), + combined.column("device")[i].as_py(), + combined.column("value")[i].as_py(), + ) + ) return rows @@ -123,7 +134,8 @@ class TestQueryTableTagFilterScalar: def test_eq_filter(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "north"), ) as result: rows = _scalar_rows(result) @@ -134,7 +146,8 @@ def test_and_filter(self): with TsFileReader(TSFILE_PATH) as reader: f = tag_eq("region", "north") & tag_eq("device", "dev_a") with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, ) as result: rows = _scalar_rows(result) @@ -145,7 +158,8 @@ def test_or_filter(self): with TsFileReader(TSFILE_PATH) as reader: f = tag_eq("region", "south") | tag_eq("region", "east") with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, ) as result: rows = _scalar_rows(result) @@ -155,8 +169,10 @@ def test_or_filter(self): def test_with_time_range(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table( - TABLE_NAME, ["region", "device", "value"], - start_time=0, end_time=7, + TABLE_NAME, + ["region", "device", "value"], + start_time=0, + end_time=7, tag_filter=tag_eq("region", "north"), ) as result: rows = _scalar_rows(result) @@ -166,7 +182,8 @@ def test_with_time_range(self): def test_no_match(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "west"), ) as result: rows = _scalar_rows(result) @@ -181,7 +198,8 @@ class TestQueryTableTagFilterArrow: def test_eq_filter(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "north"), batch_size=1024, ) as result: @@ -193,7 +211,8 @@ def test_and_filter(self): with TsFileReader(TSFILE_PATH) as reader: f = tag_eq("region", "north") & tag_eq("device", "dev_b") with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, batch_size=1024, ) as result: @@ -205,7 +224,8 @@ def test_or_filter(self): with TsFileReader(TSFILE_PATH) as reader: f = tag_eq("region", "south") | tag_eq("region", "east") with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, batch_size=1024, ) as result: @@ -216,8 +236,10 @@ def test_or_filter(self): def test_with_time_range(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table( - TABLE_NAME, ["region", "device", "value"], - start_time=0, end_time=7, + TABLE_NAME, + ["region", "device", "value"], + start_time=0, + end_time=7, tag_filter=tag_eq("region", "north"), batch_size=1024, ) as result: @@ -227,7 +249,8 @@ def test_with_time_range(self): def test_small_batch_size(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "north"), batch_size=3, ) as result: @@ -238,7 +261,8 @@ def test_small_batch_size(self): def test_no_match(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "west"), batch_size=1024, ) as result: @@ -254,7 +278,8 @@ class TestQueryTableByRowTagFilterScalar: def test_eq_filter(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "north"), ) as result: rows = _scalar_rows(result) @@ -265,7 +290,8 @@ def test_and_filter(self): with TsFileReader(TSFILE_PATH) as reader: f = tag_eq("region", "north") & tag_eq("device", "dev_a") with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, ) as result: rows = _scalar_rows(result) @@ -276,7 +302,8 @@ def test_or_filter(self): with TsFileReader(TSFILE_PATH) as reader: f = tag_eq("region", "south") | tag_eq("region", "east") with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, ) as result: rows = _scalar_rows(result) @@ -286,8 +313,10 @@ def test_or_filter(self): def test_with_offset_limit(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], - offset=2, limit=3, + TABLE_NAME, + ["region", "device", "value"], + offset=2, + limit=3, tag_filter=tag_eq("region", "north"), ) as result: rows = _scalar_rows(result) @@ -296,7 +325,8 @@ def test_with_offset_limit(self): def test_gteq_filter(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_gteq("region", "north"), ) as result: rows = _scalar_rows(result) @@ -307,7 +337,8 @@ def test_gteq_filter(self): def test_no_match(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "west"), ) as result: rows = _scalar_rows(result) @@ -322,7 +353,8 @@ class TestQueryTableByRowTagFilterArrow: def test_eq_filter(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "north"), batch_size=1024, ) as result: @@ -334,7 +366,8 @@ def test_and_filter(self): with TsFileReader(TSFILE_PATH) as reader: f = tag_eq("region", "north") & tag_eq("device", "dev_b") with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, batch_size=1024, ) as result: @@ -346,7 +379,8 @@ def test_or_filter(self): with TsFileReader(TSFILE_PATH) as reader: f = tag_eq("region", "south") | tag_eq("region", "east") with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, batch_size=1024, ) as result: @@ -357,8 +391,10 @@ def test_or_filter(self): def test_with_offset_limit(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], - offset=0, limit=6, + TABLE_NAME, + ["region", "device", "value"], + offset=0, + limit=6, tag_filter=tag_eq("region", "north"), batch_size=1024, ) as result: @@ -369,7 +405,8 @@ def test_with_offset_limit(self): def test_small_batch_size(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "south"), batch_size=2, ) as result: @@ -380,7 +417,8 @@ def test_small_batch_size(self): def test_no_match(self): with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=tag_eq("region", "west"), batch_size=1024, ) as result: @@ -397,14 +435,16 @@ def test_query_table_scalar_vs_arrow(self): f = tag_eq("region", "north") & tag_eq("device", "dev_a") with TsFileReader(TSFILE_PATH) as reader: with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, ) as result: scalar_rows = _scalar_rows(result) with TsFileReader(TSFILE_PATH) as reader: with reader.query_table( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, batch_size=1024, ) as result: @@ -420,14 +460,16 @@ def test_query_table_by_row_scalar_vs_arrow(self): f = tag_eq("region", "south") with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, ) as result: scalar_rows = _scalar_rows(result) with TsFileReader(TSFILE_PATH) as reader: with reader.query_table_by_row( - TABLE_NAME, ["region", "device", "value"], + TABLE_NAME, + ["region", "device", "value"], tag_filter=f, batch_size=1024, ) as result: diff --git a/python/tests/test_to_tsfile.py b/python/tests/test_to_tsfile.py index efd607feb..446403867 100644 --- a/python/tests/test_to_tsfile.py +++ b/python/tests/test_to_tsfile.py @@ -30,16 +30,16 @@ def convert_to_nullable_types(df): df = df.copy() for col in df.columns: dtype = df[col].dtype - if dtype == 'int64': - df[col] = df[col].astype('Int64') - elif dtype == 'int32': - df[col] = df[col].astype('Int32') - elif dtype == 'float64': - df[col] = df[col].astype('Float64') - elif dtype == 'float32': - df[col] = df[col].astype('Float32') - elif dtype == 'bool': - df[col] = df[col].astype('boolean') + if dtype == "int64": + df[col] = df[col].astype("Int64") + elif dtype == "int32": + df[col] = df[col].astype("Int32") + elif dtype == "float64": + df[col] = df[col].astype("Float64") + elif dtype == "float32": + df[col] = df[col].astype("Float32") + elif dtype == "bool": + df[col] = df[col].astype("boolean") return df @@ -49,18 +49,22 @@ def test_dataframe_to_tsfile_basic(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'time': [i for i in range(100)], - 'device': [f"device{i}" for i in range(100)], - 'value': [i * 1.5 for i in range(100)], - 'value2': [i * 10 for i in range(100)] - }) + df = pd.DataFrame( + { + "time": [i for i in range(100)], + "device": [f"device{i}" for i in range(100)], + "value": [i * 1.5 for i in range(100)], + "value2": [i * 10 for i in range(100)], + } + ) dataframe_to_tsfile(df, tsfile_path, table_name="test_table") df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) - df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + df_read = df_read.sort_values("time").reset_index(drop=True) + df_sorted = convert_to_nullable_types( + df.sort_values("time").reset_index(drop=True) + ) assert df_read.shape == (100, 4) assert df_read["time"].equals(df_sorted["time"]) @@ -78,7 +82,7 @@ def test_dataframe_to_tsfile_default_table_name(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({'time': [0, 1], 'value': [1.0, 2.0]}) + df = pd.DataFrame({"time": [0, 1], "value": [1.0, 2.0]}) dataframe_to_tsfile(df, tsfile_path) df_read = to_dataframe(tsfile_path, table_name="default_table") @@ -94,16 +98,18 @@ def test_dataframe_to_tsfile_with_index(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'device': [f"device{i}" for i in range(30)], - 'value': [i * 2.0 for i in range(30)] - }) + df = pd.DataFrame( + { + "device": [f"device{i}" for i in range(30)], + "value": [i * 2.0 for i in range(30)], + } + ) df.index = [i * 100 for i in range(30)] dataframe_to_tsfile(df, tsfile_path, table_name="test_table") df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) - time_expected = pd.Series(df.index.values, dtype='Int64') + df_read = df_read.sort_values("time").reset_index(drop=True) + time_expected = pd.Series(df.index.values, dtype="Int64") assert df_read.shape == (30, 3) assert df_read["time"].equals(time_expected) @@ -123,17 +129,23 @@ def test_dataframe_to_tsfile_custom_time_column(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'timestamp': [i for i in range(30)], - 'device': [f"device{i}" for i in range(30)], - 'value': [i * 3.0 for i in range(30)] - }) + df = pd.DataFrame( + { + "timestamp": [i for i in range(30)], + "device": [f"device{i}" for i in range(30)], + "value": [i * 3.0 for i in range(30)], + } + ) - dataframe_to_tsfile(df, tsfile_path, table_name="test_table", time_column="timestamp") + dataframe_to_tsfile( + df, tsfile_path, table_name="test_table", time_column="timestamp" + ) df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values("timestamp").reset_index(drop=True) - df_sorted = convert_to_nullable_types(df.sort_values('timestamp').reset_index(drop=True)) + df_sorted = convert_to_nullable_types( + df.sort_values("timestamp").reset_index(drop=True) + ) assert df_read.shape == (30, 3) assert df_read["timestamp"].equals(df_sorted["timestamp"]) @@ -150,16 +162,15 @@ def test_dataframe_to_tsfile_case_insensitive_time(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'Time': [i for i in range(20)], - 'value': [i * 2.0 for i in range(20)] - }) + df = pd.DataFrame( + {"Time": [i for i in range(20)], "value": [i * 2.0 for i in range(20)]} + ) dataframe_to_tsfile(df, tsfile_path, table_name="test_table") df_read = to_dataframe(tsfile_path, table_name="test_table") assert df_read.shape == (20, 2) - assert df_read["time"].equals(pd.Series([i for i in range(20)], dtype='Int64')) + assert df_read["time"].equals(pd.Series([i for i in range(20)], dtype="Int64")) finally: if os.path.exists(tsfile_path): os.remove(tsfile_path) @@ -171,18 +182,24 @@ def test_dataframe_to_tsfile_with_tag_columns(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'time': [i for i in range(20)], - 'device': [f"device{i}" for i in range(20)], - 'location': [f"loc{i % 5}" for i in range(20)], - 'value': [i * 1.5 for i in range(20)] - }) + df = pd.DataFrame( + { + "time": [i for i in range(20)], + "device": [f"device{i}" for i in range(20)], + "location": [f"loc{i % 5}" for i in range(20)], + "value": [i * 1.5 for i in range(20)], + } + ) - dataframe_to_tsfile(df, tsfile_path, table_name="test_table", tag_column=["device", "location"]) + dataframe_to_tsfile( + df, tsfile_path, table_name="test_table", tag_column=["device", "location"] + ) df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) - df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + df_sorted = convert_to_nullable_types( + df.sort_values("time").reset_index(drop=True) + ) assert df_read.shape == (20, 4) assert df_read["device"].equals(df_sorted["device"]) @@ -199,17 +216,31 @@ def test_dataframe_to_tsfile_tag_time_unsorted(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'time': [30, 10, 20, 50, 40, 15, 25, 35, 5, 45], - 'device': ['device1', 'device1', 'device1', 'device2', 'device2', 'device1', 'device1', 'device2', - 'device1', 'device2'], - 'value': [i * 1.5 for i in range(10)] - }) - - dataframe_to_tsfile(df, tsfile_path, table_name="test_table", tag_column=["device"]) + df = pd.DataFrame( + { + "time": [30, 10, 20, 50, 40, 15, 25, 35, 5, 45], + "device": [ + "device1", + "device1", + "device1", + "device2", + "device2", + "device1", + "device1", + "device2", + "device1", + "device2", + ], + "value": [i * 1.5 for i in range(10)], + } + ) + + dataframe_to_tsfile( + df, tsfile_path, table_name="test_table", tag_column=["device"] + ) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_expected = df.sort_values(by=['device', 'time']).reset_index(drop=True) + df_expected = df.sort_values(by=["device", "time"]).reset_index(drop=True) df_expected = convert_to_nullable_types(df_expected) assert df_read.shape == (10, 3) @@ -227,25 +258,29 @@ def test_dataframe_to_tsfile_all_datatypes(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'time': [i for i in range(50)], - 'bool_col': [i % 2 == 0 for i in range(50)], - 'int32_col': pd.Series([i for i in range(50)], dtype='int32'), - 'int64_col': [i * 10 for i in range(50)], - 'float_col': pd.Series([i * 1.5 for i in range(50)], dtype='float32'), - 'double_col': [i * 2.5 for i in range(50)], - 'string_col': [f"str{i}" for i in range(50)], - 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)], - 'text_col': [f"text{i}" for i in range(50)], - 'date_col': [date(2025, i % 11 + 1, i % 20 + 1) for i in range(50)], - 'timestamp_col': [i for i in range(50)] - }) + df = pd.DataFrame( + { + "time": [i for i in range(50)], + "bool_col": [i % 2 == 0 for i in range(50)], + "int32_col": pd.Series([i for i in range(50)], dtype="int32"), + "int64_col": [i * 10 for i in range(50)], + "float_col": pd.Series([i * 1.5 for i in range(50)], dtype="float32"), + "double_col": [i * 2.5 for i in range(50)], + "string_col": [f"str{i}" for i in range(50)], + "blob_col": [f"blob{i}".encode("utf-8") for i in range(50)], + "text_col": [f"text{i}" for i in range(50)], + "date_col": [date(2025, i % 11 + 1, i % 20 + 1) for i in range(50)], + "timestamp_col": [i for i in range(50)], + } + ) dataframe_to_tsfile(df, tsfile_path, table_name="test_table") df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) - df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + df_sorted = convert_to_nullable_types( + df.sort_values("time").reset_index(drop=True) + ) assert df_read.shape == (50, 11) assert df_read["bool_col"].equals(df_sorted["bool_col"]) @@ -285,11 +320,11 @@ def test_dataframe_to_tsfile_no_data_columns(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'time': [i for i in range(10)] - }) + df = pd.DataFrame({"time": [i for i in range(10)]}) - with pytest.raises(ValueError, match="DataFrame must have at least one data column"): + with pytest.raises( + ValueError, match="DataFrame must have at least one data column" + ): dataframe_to_tsfile(df, tsfile_path) finally: if os.path.exists(tsfile_path): @@ -303,7 +338,9 @@ def test_dataframe_to_tsfile_only_time_column_raises(): if os.path.exists(tsfile_path): os.remove(tsfile_path) df = pd.DataFrame({"time": [1, 2, 3]}) - with pytest.raises(ValueError, match="at least one data column besides the time column"): + with pytest.raises( + ValueError, match="at least one data column besides the time column" + ): dataframe_to_tsfile(df, tsfile_path) finally: if os.path.exists(tsfile_path): @@ -316,7 +353,7 @@ def test_dataframe_to_tsfile_time_column_not_found(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({'time': [0, 1], 'value': [1.0, 2.0]}) + df = pd.DataFrame({"time": [0, 1], "value": [1.0, 2.0]}) with pytest.raises(ValueError, match="Time column 'timestamp' not found"): dataframe_to_tsfile(df, tsfile_path, time_column="timestamp") finally: @@ -330,10 +367,9 @@ def test_dataframe_to_tsfile_invalid_time_column(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'timestamp': [i for i in range(10)], - 'value': [i * 1.0 for i in range(10)] - }) + df = pd.DataFrame( + {"timestamp": [i for i in range(10)], "value": [i * 1.0 for i in range(10)]} + ) with pytest.raises(ValueError, match="Time column 'time' not found"): dataframe_to_tsfile(df, tsfile_path, time_column="time") @@ -348,10 +384,12 @@ def test_dataframe_to_tsfile_non_integer_time_column(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'time': [f"time{i}" for i in range(10)], - 'value': [i * 1.0 for i in range(10)] - }) + df = pd.DataFrame( + { + "time": [f"time{i}" for i in range(10)], + "value": [i * 1.0 for i in range(10)], + } + ) with pytest.raises(TypeError, match="must be integer type"): dataframe_to_tsfile(df, tsfile_path) @@ -366,7 +404,7 @@ def test_dataframe_to_tsfile_tag_column_not_found(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({'time': [0, 1], 'device': ['a', 'b'], 'value': [1.0, 2.0]}) + df = pd.DataFrame({"time": [0, 1], "device": ["a", "b"], "value": [1.0, 2.0]}) with pytest.raises(ValueError, match="Tag column 'invalid' not found"): dataframe_to_tsfile(df, tsfile_path, tag_column=["invalid"]) finally: @@ -380,10 +418,9 @@ def test_dataframe_to_tsfile_invalid_tag_column(): if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'time': [i for i in range(10)], - 'value': [i * 1.0 for i in range(10)] - }) + df = pd.DataFrame( + {"time": [i for i in range(10)], "value": [i * 1.0 for i in range(10)]} + ) with pytest.raises(ValueError, match="Tag column 'invalid' not found"): dataframe_to_tsfile(df, tsfile_path, tag_column=["invalid"]) diff --git a/python/tests/test_tsfile_dataset.py b/python/tests/test_tsfile_dataset.py index 63cd439a1..c51c52992 100644 --- a/python/tests/test_tsfile_dataset.py +++ b/python/tests/test_tsfile_dataset.py @@ -20,7 +20,13 @@ import pandas as pd import pytest -from tsfile import ColumnCategory, ColumnSchema, TSDataType, TableSchema, TsFileTableWriter +from tsfile import ( + ColumnCategory, + ColumnSchema, + TSDataType, + TableSchema, + TsFileTableWriter, +) from tsfile import AlignedTimeseries, Timeseries, TsFileDataFrame from tsfile.dataset.formatting import format_timestamp from tsfile.dataset.reader import TsFileSeriesReader @@ -314,18 +320,22 @@ def test_dataset_multi_tag_metadata_discovery(tmp_path): "weather.shanghai.device_b.humidity", ] - summary = pd.DataFrame( - { - "series_path": tsdf.list_timeseries(), - "table": tsdf["table"], - "city": tsdf["city"], - "device": tsdf["device"], - "field": tsdf["field"], - "start_time": tsdf["start_time"], - "end_time": tsdf["end_time"], - "count": tsdf["count"], - } - ).sort_values(["city", "device", "field"]).reset_index(drop=True) + summary = ( + pd.DataFrame( + { + "series_path": tsdf.list_timeseries(), + "table": tsdf["table"], + "city": tsdf["city"], + "device": tsdf["device"], + "field": tsdf["field"], + "start_time": tsdf["start_time"], + "end_time": tsdf["end_time"], + "count": tsdf["count"], + } + ) + .sort_values(["city", "device", "field"]) + .reset_index(drop=True) + ) assert list(summary.columns) == [ "series_path", "table", @@ -337,8 +347,18 @@ def test_dataset_multi_tag_metadata_discovery(tmp_path): "count", ] assert list(summary["city"]) == ["beijing", "beijing", "shanghai", "shanghai"] - assert list(summary["device"]) == ["device_a", "device_a", "device_b", "device_b"] - assert list(summary["field"]) == ["humidity", "temperature", "humidity", "temperature"] + assert list(summary["device"]) == [ + "device_a", + "device_a", + "device_b", + "device_b", + ] + assert list(summary["field"]) == [ + "humidity", + "temperature", + "humidity", + "temperature", + ] assert list(summary["count"]) == [2, 2, 2, 2] diff --git a/python/tests/test_write.py b/python/tests/test_write.py index cae992b28..88540d5e2 100644 --- a/python/tests/test_write.py +++ b/python/tests/test_write.py @@ -24,19 +24,21 @@ from tsfile import Tablet, RowRecord, Field from tsfile import TSDataType + def test_row_record_write(): try: writer = TsFileWriter("record_write.tsfile") timeseries = TimeseriesSchema("level1", TSDataType.INT64) writer.register_timeseries("root.device1", timeseries) - record = RowRecord("root.device1", 10,[Field("level1", 10, TSDataType.INT64)]) + record = RowRecord("root.device1", 10, [Field("level1", 10, TSDataType.INT64)]) writer.write_row_record(record) writer.close() finally: if os.path.exists("record_write.tsfile"): os.remove("record_write.tsfile") + def test_tablet_write(): try: writer = TsFileWriter("tablet_write.tsfile") @@ -45,7 +47,9 @@ def test_tablet_write(): device = DeviceSchema("root.device1", [timeseries1, timeseries2]) writer.register_device(device) - tablet = Tablet(["level1", "level2"], [TSDataType.INT64, TSDataType.DOUBLE], 100) + tablet = Tablet( + ["level1", "level2"], [TSDataType.INT64, TSDataType.DOUBLE], 100 + ) tablet.set_table_name("root.device1") for i in range(100): tablet.add_timestamp(i, i) @@ -58,6 +62,7 @@ def test_tablet_write(): if os.path.exists("tablet_write.tsfile"): os.remove("tablet_write.tsfile") + def test_tablet_write(): try: writer = TsFileWriter("tablet_write.tsfile") @@ -66,7 +71,9 @@ def test_tablet_write(): device = DeviceSchema("root.device1", [timeseries1, timeseries2]) writer.register_device(device) - tablet = Tablet(["level1", "level2"], [TSDataType.INT64, TSDataType.DOUBLE], 100) + tablet = Tablet( + ["level1", "level2"], [TSDataType.INT64, TSDataType.DOUBLE], 100 + ) tablet.set_table_name("root.device1") for i in range(100): tablet.add_timestamp(i, i) @@ -79,6 +86,7 @@ def test_tablet_write(): if os.path.exists("tablet_write.tsfile"): os.remove("tablet_write.tsfile") + def test_table_write(): try: with TsFileWriter("table_write.tsfile") as writer: @@ -90,9 +98,16 @@ def test_table_write(): writer.register_table(table) row_num = 100 - tablet = Tablet( ["device", "sensor", "value1", "value2"], - [TSDataType.STRING, TSDataType.STRING, TSDataType.DOUBLE, TSDataType.INT32], - row_num) + tablet = Tablet( + ["device", "sensor", "value1", "value2"], + [ + TSDataType.STRING, + TSDataType.STRING, + TSDataType.DOUBLE, + TSDataType.INT32, + ], + row_num, + ) tablet.set_table_name("test_table") for i in range(100): tablet.add_timestamp(i, i) @@ -106,6 +121,7 @@ def test_table_write(): if os.path.exists("table_write.tsfile"): os.remove("table_write.tsfile") + def test_flush(): file_name = "table_flush.tsfile" try: @@ -117,9 +133,9 @@ def test_flush(): writer.register_table(table) row_num = 100 - tablet = Tablet(["item_id", "value"], - [TSDataType.STRING, TSDataType.DOUBLE], - row_num) + tablet = Tablet( + ["item_id", "value"], [TSDataType.STRING, TSDataType.DOUBLE], row_num + ) tablet.set_table_name("test_flush") for i in range(100): tablet.add_timestamp(i, i) @@ -132,12 +148,3 @@ def test_flush(): finally: if os.path.exists(file_name): os.remove(file_name) - - - - - - - - - diff --git a/python/tests/test_write_and_read.py b/python/tests/test_write_and_read.py index 57294a846..e76fef538 100644 --- a/python/tests/test_write_and_read.py +++ b/python/tests/test_write_and_read.py @@ -33,7 +33,12 @@ from tsfile import TsFileTableWriter from tsfile import TsFileWriter, TsFileReader, ColumnCategory from tsfile import to_dataframe -from tsfile.exceptions import TableNotExistError, ColumnNotExistError, NotSupportedError, TypeMismatchError +from tsfile.exceptions import ( + TableNotExistError, + ColumnNotExistError, + NotSupportedError, + TypeMismatchError, +) def test_row_record_write_and_read(): @@ -41,27 +46,48 @@ def test_row_record_write_and_read(): if os.path.exists("record_write_and_read.tsfile"): os.remove("record_write_and_read.tsfile") writer = TsFileWriter("record_write_and_read.tsfile") - writer.register_timeseries("root.device1", TimeseriesSchema("level1", TSDataType.INT64)) - writer.register_timeseries("root.device1", TimeseriesSchema("level2", TSDataType.DOUBLE)) - writer.register_timeseries("root.device1", TimeseriesSchema("level3", TSDataType.INT32)) - writer.register_timeseries("root.device1", TimeseriesSchema("level4", TSDataType.STRING)) - writer.register_timeseries("root.device1", TimeseriesSchema("level5", TSDataType.TEXT)) - writer.register_timeseries("root.device1", TimeseriesSchema("level6", TSDataType.BLOB)) - writer.register_timeseries("root.device1", TimeseriesSchema("level7", TSDataType.DATE)) - writer.register_timeseries("root.device1", TimeseriesSchema("level8", TSDataType.TIMESTAMP)) + writer.register_timeseries( + "root.device1", TimeseriesSchema("level1", TSDataType.INT64) + ) + writer.register_timeseries( + "root.device1", TimeseriesSchema("level2", TSDataType.DOUBLE) + ) + writer.register_timeseries( + "root.device1", TimeseriesSchema("level3", TSDataType.INT32) + ) + writer.register_timeseries( + "root.device1", TimeseriesSchema("level4", TSDataType.STRING) + ) + writer.register_timeseries( + "root.device1", TimeseriesSchema("level5", TSDataType.TEXT) + ) + writer.register_timeseries( + "root.device1", TimeseriesSchema("level6", TSDataType.BLOB) + ) + writer.register_timeseries( + "root.device1", TimeseriesSchema("level7", TSDataType.DATE) + ) + writer.register_timeseries( + "root.device1", TimeseriesSchema("level8", TSDataType.TIMESTAMP) + ) max_row_num = 10 for i in range(max_row_num): - row = RowRecord("root.device1", i, - [Field("level1", i + 1, TSDataType.INT64), - Field("level2", i * 1.1, TSDataType.DOUBLE), - Field("level3", i * 2, TSDataType.INT32), - Field("level4", f"string_value_{i}", TSDataType.STRING), - Field("level5", f"text_value_{i}", TSDataType.TEXT), - Field("level6", f"blob_data_{i}".encode('utf-8'), TSDataType.BLOB), - Field("level7", date(2025, 1, i % 20 + 1), TSDataType.DATE), - Field("level8", i, TSDataType.TIMESTAMP)]) + row = RowRecord( + "root.device1", + i, + [ + Field("level1", i + 1, TSDataType.INT64), + Field("level2", i * 1.1, TSDataType.DOUBLE), + Field("level3", i * 2, TSDataType.INT32), + Field("level4", f"string_value_{i}", TSDataType.STRING), + Field("level5", f"text_value_{i}", TSDataType.TEXT), + Field("level6", f"blob_data_{i}".encode("utf-8"), TSDataType.BLOB), + Field("level7", date(2025, 1, i % 20 + 1), TSDataType.DATE), + Field("level8", i, TSDataType.TIMESTAMP), + ], + ) writer.write_row_record(row) writer.close() @@ -69,7 +95,16 @@ def test_row_record_write_and_read(): reader = TsFileReader("record_write_and_read.tsfile") result = reader.query_timeseries( "root.device1", - ["level1", "level2", "level3", "level4", "level5", "level6", "level7", "level8"], + [ + "level1", + "level2", + "level3", + "level4", + "level5", + "level6", + "level7", + "level8", + ], 0, 100, ) @@ -84,7 +119,9 @@ def test_row_record_write_and_read(): assert result.get_value_by_index(4) == row_num * 2 assert result.get_value_by_index(5) == f"string_value_{row_num}" assert result.get_value_by_index(6) == f"text_value_{row_num}" - assert result.get_value_by_index(7) == f"blob_data_{row_num}".encode('utf-8') + assert result.get_value_by_index(7) == f"blob_data_{row_num}".encode( + "utf-8" + ) assert result.get_value_by_index(8) == date(2025, 1, row_num % 20 + 1) assert result.get_value_by_index(9) == row_num @@ -95,8 +132,6 @@ def test_row_record_write_and_read(): assert len(reader.get_active_query_result()) == 0 reader.close() - - finally: if os.path.exists("record_write_and_read.tsfile"): os.remove("record_write_and_read.tsfile") @@ -192,7 +227,10 @@ def _extract_device(row, path_columns): requested_columns = ["level", "temperature"] df_subset = to_dataframe( - file_path, column_names=requested_columns, start_time=0, end_time=rows_per_device + file_path, + column_names=requested_columns, + start_time=0, + end_time=rows_per_device, ) for column in requested_columns: assert column in df_subset.columns @@ -211,7 +249,11 @@ def _extract_device(row, path_columns): assert _is_null(value) assert device in device_path_map df_limited = to_dataframe( - file_path, column_names=["level"], max_row_num=5, start_time=0, end_time=rows_per_device + file_path, + column_names=["level"], + max_row_num=5, + start_time=0, + end_time=rows_per_device, ) assert df_limited.shape[0] == 5 assert "level" in df_limited.columns @@ -324,19 +366,26 @@ def test_tablet_write_and_read(): writer = TsFileWriter("tablet_write_and_read.tsfile") measurement_num = 30 for i in range(measurement_num): - writer.register_timeseries("root.device1", TimeseriesSchema('level' + str(i), TSDataType.INT64)) + writer.register_timeseries( + "root.device1", TimeseriesSchema("level" + str(i), TSDataType.INT64) + ) max_row_num = 10000 tablet_row_num = 1000 tablet_num = 0 for i in range(max_row_num // tablet_row_num): - tablet = Tablet([f'level{j}' for j in range(measurement_num)], - [TSDataType.INT64 for _ in range(measurement_num)], tablet_row_num) + tablet = Tablet( + [f"level{j}" for j in range(measurement_num)], + [TSDataType.INT64 for _ in range(measurement_num)], + tablet_row_num, + ) tablet.set_table_name("root.device1") for row in range(tablet_row_num): tablet.add_timestamp(row, row + tablet_num * tablet_row_num) for col in range(measurement_num): - tablet.add_value_by_index(col, row, row + tablet_num * tablet_row_num) + tablet.add_value_by_index( + col, row, row + tablet_num * tablet_row_num + ) writer.write_tablet(tablet) tablet_num += 1 @@ -364,15 +413,20 @@ def test_tablet_write_and_read(): def test_table_writer_and_reader(): - table = TableSchema("test_table", - [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) try: if os.path.exists("table_write.tsfile"): os.remove("table_write.tsfile") with TsFileTableWriter("table_write.tsfile", table) as writer: - tablet = Tablet(["device", "value"], - [TSDataType.STRING, TSDataType.DOUBLE], 100) + tablet = Tablet( + ["device", "value"], [TSDataType.STRING, TSDataType.DOUBLE], 100 + ) for i in range(100): tablet.add_timestamp(i, i) tablet.add_value_by_name("device", i, "device" + str(i)) @@ -380,12 +434,13 @@ def test_table_writer_and_reader(): writer.write_table(tablet) with TsFileReader("table_write.tsfile") as reader: - with reader.query_table("test_table", ["device", "value"], - 0, 10) as result: + with reader.query_table("test_table", ["device", "value"], 0, 10) as result: cur_line = 0 while result.next(): cur_time = result.get_value_by_name(TIME_COLUMN) - assert result.get_value_by_name("device") == "device" + str(cur_time) + assert result.get_value_by_name("device") == "device" + str( + cur_time + ) assert result.is_null_by_name("device") == False assert result.is_null_by_name("value") == False assert result.is_null_by_index(1) == False @@ -394,8 +449,9 @@ def test_table_writer_and_reader(): assert result.get_value_by_name("value") == cur_time * 100.0 cur_line = cur_line + 1 assert cur_line == 11 - with reader.query_table("test_table", ["device", "value"], - 0, 100) as result: + with reader.query_table( + "test_table", ["device", "value"], 0, 100 + ) as result: line_num = 0 print("dataframe") while result.next(): @@ -412,8 +468,10 @@ def test_table_writer_and_reader(): tableSchema = schemas["test_table"] assert tableSchema.get_table_name() == "test_table" print(tableSchema) - assert tableSchema.__repr__() == ("TableSchema(test_table, [ColumnSchema(device," - " STRING, TAG), ColumnSchema(value, DOUBLE, FIELD)])") + assert tableSchema.__repr__() == ( + "TableSchema(test_table, [ColumnSchema(device," + " STRING, TAG), ColumnSchema(value, DOUBLE, FIELD)])" + ) finally: if os.path.exists("table_write.tsfile"): os.remove("table_write.tsfile") @@ -427,8 +485,7 @@ def test_query_result_detach_from_reader(): writer.register_timeseries("root.device1", timeseries) max_row_num = 1000 for i in range(max_row_num): - row = RowRecord("root.device1", i, - [Field("level1", i, TSDataType.INT64)]) + row = RowRecord("root.device1", i, [Field("level1", i, TSDataType.INT64)]) writer.write_row_record(row) writer.close() @@ -453,9 +510,13 @@ def test_query_result_detach_from_reader(): def test_lower_case_name(): if os.path.exists("lower_case_name.tsfile"): os.remove("lower_case_name.tsfile") - table = TableSchema("tEst_Table", - [ColumnSchema("Device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("vAlue", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + table = TableSchema( + "tEst_Table", + [ + ColumnSchema("Device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("vAlue", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) with TsFileTableWriter("lower_case_name.tsfile", table) as writer: tablet = Tablet(["device", "VALUE"], [TSDataType.STRING, TSDataType.DOUBLE]) for i in range(100): @@ -479,9 +540,13 @@ def test_tsfile_config(): config = get_tsfile_config() - table = TableSchema("tEst_Table", - [ColumnSchema("Device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("vAlue", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + table = TableSchema( + "tEst_Table", + [ + ColumnSchema("Device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("vAlue", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) if os.path.exists("test1.tsfile"): os.remove("test1.tsfile") with TsFileTableWriter("test1.tsfile", table) as writer: @@ -509,14 +574,14 @@ def test_tsfile_config(): config_modified = get_tsfile_config() assert config_normal != config_modified assert config_modified["chunk_group_size_threshold_"] == 100 * 100 - set_tsfile_config({'chunk_group_size_threshold_': 100 * 20}) + set_tsfile_config({"chunk_group_size_threshold_": 100 * 20}) assert get_tsfile_config()["chunk_group_size_threshold_"] == 100 * 20 with pytest.raises(TypeError): set_tsfile_config({"time_compress_type_": TSDataType.DOUBLE}) with pytest.raises(TypeError): - set_tsfile_config({'chunk_group_size_threshold_': -1 * 100 * 20}) + set_tsfile_config({"chunk_group_size_threshold_": -1 * 100 * 20}) - set_tsfile_config({'float_encoding_type_': TSEncoding.PLAIN}) + set_tsfile_config({"float_encoding_type_": TSEncoding.PLAIN}) assert get_tsfile_config()["float_encoding_type_"] == TSEncoding.PLAIN with pytest.raises(TypeError): @@ -528,14 +593,21 @@ def test_tsfile_config(): def test_tsfile_to_df(): - table = TableSchema("test_table", - [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("value2", TSDataType.INT64, ColumnCategory.FIELD)]) + table = TableSchema( + "test_table", + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("value2", TSDataType.INT64, ColumnCategory.FIELD), + ], + ) try: with TsFileTableWriter("table_write_to_df.tsfile", table) as writer: - tablet = Tablet(["device", "value", "value2"], - [TSDataType.STRING, TSDataType.DOUBLE, TSDataType.INT64], 4097) + tablet = Tablet( + ["device", "value", "value2"], + [TSDataType.STRING, TSDataType.DOUBLE, TSDataType.INT64], + 4097, + ) for i in range(4097): tablet.add_timestamp(i, i) tablet.add_value_by_name("device", i, "device" + str(i)) @@ -548,10 +620,16 @@ def test_tsfile_to_df(): assert is_integer_dtype(df1[TIME_COLUMN]) assert df1["value"].dtype == Float64Dtype() assert is_integer_dtype(df1["value2"]) - df2 = to_dataframe("table_write_to_df.tsfile", column_names=["device", "value2"]) + df2 = to_dataframe( + "table_write_to_df.tsfile", column_names=["device", "value2"] + ) assert df2.shape == (4097, 3) assert df1["value2"].equals(df2["value2"]) - df3 = to_dataframe("table_write_to_df.tsfile", column_names=["device", "value"], max_row_num=8000) + df3 = to_dataframe( + "table_write_to_df.tsfile", + column_names=["device", "value"], + max_row_num=8000, + ) assert df3.shape == (4097, 3) with pytest.raises(TableNotExistError): to_dataframe("table_write_to_df.tsfile", "test_tb") @@ -610,7 +688,7 @@ def test_tree_all_datatype_query_to_dataframe_variants(): Field("LeveL3", i * 3, TSDataType.INT32), Field("LeveL4", f"string_value_{i}", TSDataType.STRING), Field("LeveL5", f"text_value_{i}", TSDataType.TEXT), - Field("LeveL6", f"blob_data_{i}".encode('utf-8'), TSDataType.BLOB), + Field("LeveL6", f"blob_data_{i}".encode("utf-8"), TSDataType.BLOB), Field("LeveL7", date(2025, 1, i % 20 + 1), TSDataType.DATE), Field("LeveL8", i * 8, TSDataType.TIMESTAMP), Field("LeveL9", i % 2 == 0, TSDataType.BOOLEAN), @@ -645,7 +723,7 @@ def test_tree_all_datatype_query_to_dataframe_variants(): assert df2_5.iloc[i, 3] == f"text_value_{i}" df2_6 = to_dataframe(tsfile_path, column_names=["LeveL6"]) for i in range(max_row_num): - assert df2_6.iloc[i, 3] == f"blob_data_{i}".encode('utf-8') + assert df2_6.iloc[i, 3] == f"blob_data_{i}".encode("utf-8") df2_7 = to_dataframe(tsfile_path, column_names=["LeveL7"]) for i in range(max_row_num): assert df2_7.iloc[i, 3] == date(2025, 1, i % 20 + 1) @@ -682,7 +760,7 @@ def test_tree_all_datatype_query_to_dataframe_variants(): assert df2_12.iloc[i, 5] == np.int32(i * 3) assert df2_12.iloc[i, 6] == f"string_value_{i}" assert df2_12.iloc[i, 7] == f"text_value_{i}" - assert df2_12.iloc[i, 8] == f"blob_data_{i}".encode('utf-8') + assert df2_12.iloc[i, 8] == f"blob_data_{i}".encode("utf-8") assert df2_12.iloc[i, 9] == date(2025, 1, i % 20 + 1) assert df2_12.iloc[i, 10] == np.int64(i * 8) assert df2_12.iloc[i, 11] == (i % 2 == 0) @@ -734,12 +812,12 @@ def test_tree_all_datatype_query_to_dataframe_variants(): row_num = 0 for df6_1 in to_dataframe( - tsfile_path, - column_names=["LeveL1", "LeveL2"], - start_time=-50, - end_time=10, - max_row_num=1, - as_iterator=True, + tsfile_path, + column_names=["LeveL1", "LeveL2"], + start_time=-50, + end_time=10, + max_row_num=1, + as_iterator=True, ): assert df6_1.shape[0] == 1 assert df6_1.iloc[0, 0] == -50 + row_num @@ -763,7 +841,4 @@ def test_tree_all_datatype_query_to_dataframe_variants(): if __name__ == "__main__": os.chdir(os.path.dirname(os.path.abspath(__file__))) - pytest.main([ - "test_write_and_read.py::test_row_record_write_and_read", - "-s", "-v" - ]) + pytest.main(["test_write_and_read.py::test_row_record_write_and_read", "-s", "-v"]) diff --git a/python/tests/test_write_arrow.py b/python/tests/test_write_arrow.py index 5621d22e3..db8b3baf4 100644 --- a/python/tests/test_write_arrow.py +++ b/python/tests/test_write_arrow.py @@ -37,6 +37,7 @@ # Helpers # --------------------------------------------------------------------------- + def _make_schema(table_name, extra_cols): """Build a TableSchema with a string TAG 'device' plus the given field cols.""" return TableSchema( @@ -45,7 +46,9 @@ def _make_schema(table_name, extra_cols): ) -def _read_all_arrow(file_path, table_name, columns, start=0, end=10**18, batch_size=4096): +def _read_all_arrow( + file_path, table_name, columns, start=0, end=10**18, batch_size=4096 +): """Read all rows from file via read_arrow_batch and return as a pa.Table.""" reader = TsFileReader(file_path) rs = reader.query_table( @@ -72,23 +75,29 @@ def _read_all_arrow(file_path, table_name, columns, start=0, end=10**18, batch_s # Basic write + read-back # --------------------------------------------------------------------------- + def test_write_arrow_basic(): """Write 1 000 rows via write_arrow_batch and verify count + values.""" path = "test_write_arrow_basic.tsfile" table_name = "t" n = 1000 - schema = _make_schema(table_name, [ - ColumnSchema("value1", TSDataType.INT64, ColumnCategory.FIELD), - ColumnSchema("value2", TSDataType.DOUBLE, ColumnCategory.FIELD), - ]) + schema = _make_schema( + table_name, + [ + ColumnSchema("value1", TSDataType.INT64, ColumnCategory.FIELD), + ColumnSchema("value2", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) - batch = pa.record_batch({ - "time": pa.array(np.arange(n, dtype="int64"), type=pa.timestamp("ns")), - "device": pa.array([f"d{i}" for i in range(n)], type=pa.string()), - "value1": pa.array(np.arange(n, dtype="int64"), type=pa.int64()), - "value2": pa.array(np.arange(n, dtype="float64") * 1.5, type=pa.float64()), - }) + batch = pa.record_batch( + { + "time": pa.array(np.arange(n, dtype="int64"), type=pa.timestamp("ns")), + "device": pa.array([f"d{i}" for i in range(n)], type=pa.string()), + "value1": pa.array(np.arange(n, dtype="int64"), type=pa.int64()), + "value2": pa.array(np.arange(n, dtype="float64") * 1.5, type=pa.float64()), + } + ) try: if os.path.exists(path): @@ -111,21 +120,27 @@ def test_write_arrow_basic(): # pa.Table input # --------------------------------------------------------------------------- + def test_write_arrow_from_table(): """write_arrow_batch should accept pa.Table (multi-chunk) as well.""" path = "test_write_arrow_from_table.tsfile" table_name = "t" n = 500 - schema = _make_schema(table_name, [ - ColumnSchema("v", TSDataType.INT32, ColumnCategory.FIELD), - ]) + schema = _make_schema( + table_name, + [ + ColumnSchema("v", TSDataType.INT32, ColumnCategory.FIELD), + ], + ) - tbl = pa.table({ - "time": pa.array(np.arange(n, dtype="int64"), type=pa.timestamp("ns")), - "device": pa.array(["dev"] * n, type=pa.string()), - "v": pa.array(np.arange(n, dtype="int32"), type=pa.int32()), - }) + tbl = pa.table( + { + "time": pa.array(np.arange(n, dtype="int64"), type=pa.timestamp("ns")), + "device": pa.array(["dev"] * n, type=pa.string()), + "v": pa.array(np.arange(n, dtype="int32"), type=pa.int32()), + } + ) try: if os.path.exists(path): @@ -146,6 +161,7 @@ def test_write_arrow_from_table(): # Multiple batches # --------------------------------------------------------------------------- + def test_write_arrow_multiple_batches(): """Write several batches sequentially and verify the total row count.""" path = "test_write_arrow_multi.tsfile" @@ -154,9 +170,12 @@ def test_write_arrow_multiple_batches(): num_batches = 4 total = rows_per_batch * num_batches - schema = _make_schema(table_name, [ - ColumnSchema("v", TSDataType.INT64, ColumnCategory.FIELD), - ]) + schema = _make_schema( + table_name, + [ + ColumnSchema("v", TSDataType.INT64, ColumnCategory.FIELD), + ], + ) try: if os.path.exists(path): @@ -164,15 +183,23 @@ def test_write_arrow_multiple_batches(): with TsFileTableWriter(path, schema) as w: for b in range(num_batches): start_ts = b * rows_per_batch - batch = pa.record_batch({ - "time": pa.array( - np.arange(start_ts, start_ts + rows_per_batch, dtype="int64"), - type=pa.timestamp("ns")), - "device": pa.array(["dev"] * rows_per_batch, type=pa.string()), - "v": pa.array( - np.arange(start_ts, start_ts + rows_per_batch, dtype="int64"), - type=pa.int64()), - }) + batch = pa.record_batch( + { + "time": pa.array( + np.arange( + start_ts, start_ts + rows_per_batch, dtype="int64" + ), + type=pa.timestamp("ns"), + ), + "device": pa.array(["dev"] * rows_per_batch, type=pa.string()), + "v": pa.array( + np.arange( + start_ts, start_ts + rows_per_batch, dtype="int64" + ), + type=pa.int64(), + ), + } + ) w.write_arrow_batch(batch) result = _read_all_arrow(path, table_name, ["device", "v"]) @@ -186,38 +213,48 @@ def test_write_arrow_multiple_batches(): # All supported data types # --------------------------------------------------------------------------- + def test_write_arrow_all_datatypes(): """Write every supported data type and verify values read back correctly.""" path = "test_write_arrow_all_types.tsfile" table_name = "t" n = 200 - schema = TableSchema(table_name, [ - ColumnSchema("tag", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("bool_col", TSDataType.BOOLEAN, ColumnCategory.FIELD), - ColumnSchema("int32_col", TSDataType.INT32, ColumnCategory.FIELD), - ColumnSchema("int64_col", TSDataType.INT64, ColumnCategory.FIELD), - ColumnSchema("float_col", TSDataType.FLOAT, ColumnCategory.FIELD), - ColumnSchema("double_col", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("str_col", TSDataType.STRING, ColumnCategory.FIELD), - ColumnSchema("date_col", TSDataType.DATE, ColumnCategory.FIELD), - ]) + schema = TableSchema( + table_name, + [ + ColumnSchema("tag", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("bool_col", TSDataType.BOOLEAN, ColumnCategory.FIELD), + ColumnSchema("int32_col", TSDataType.INT32, ColumnCategory.FIELD), + ColumnSchema("int64_col", TSDataType.INT64, ColumnCategory.FIELD), + ColumnSchema("float_col", TSDataType.FLOAT, ColumnCategory.FIELD), + ColumnSchema("double_col", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("str_col", TSDataType.STRING, ColumnCategory.FIELD), + ColumnSchema("date_col", TSDataType.DATE, ColumnCategory.FIELD), + ], + ) dates_days = [ (date(2025, 1, (i % 28) + 1) - date(1970, 1, 1)).days for i in range(n) ] - batch = pa.record_batch({ - "time": pa.array(np.arange(n, dtype="int64"), type=pa.timestamp("ns")), - "tag": pa.array([f"dev{i}" for i in range(n)], type=pa.string()), - "bool_col": pa.array([i % 2 == 0 for i in range(n)], type=pa.bool_()), - "int32_col": pa.array(np.arange(n, dtype="int32"), type=pa.int32()), - "int64_col": pa.array(np.arange(n, dtype="int64") * 10, type=pa.int64()), - "float_col": pa.array(np.arange(n, dtype="float32") * 0.5, type=pa.float32()), - "double_col": pa.array(np.arange(n, dtype="float64") * 1.1, type=pa.float64()), - "str_col": pa.array([f"s{i}" for i in range(n)], type=pa.string()), - "date_col": pa.array(dates_days, type=pa.date32()), - }) + batch = pa.record_batch( + { + "time": pa.array(np.arange(n, dtype="int64"), type=pa.timestamp("ns")), + "tag": pa.array([f"dev{i}" for i in range(n)], type=pa.string()), + "bool_col": pa.array([i % 2 == 0 for i in range(n)], type=pa.bool_()), + "int32_col": pa.array(np.arange(n, dtype="int32"), type=pa.int32()), + "int64_col": pa.array(np.arange(n, dtype="int64") * 10, type=pa.int64()), + "float_col": pa.array( + np.arange(n, dtype="float32") * 0.5, type=pa.float32() + ), + "double_col": pa.array( + np.arange(n, dtype="float64") * 1.1, type=pa.float64() + ), + "str_col": pa.array([f"s{i}" for i in range(n)], type=pa.string()), + "date_col": pa.array(dates_days, type=pa.date32()), + } + ) try: if os.path.exists(path): @@ -226,15 +263,32 @@ def test_write_arrow_all_datatypes(): w.write_arrow_batch(batch) result = _read_all_arrow( - path, table_name, - ["tag", "bool_col", "int32_col", "int64_col", - "float_col", "double_col", "str_col", "date_col"], + path, + table_name, + [ + "tag", + "bool_col", + "int32_col", + "int64_col", + "float_col", + "double_col", + "str_col", + "date_col", + ], ) assert len(result) == n df = result.to_pandas().sort_values("time").reset_index(drop=True) - for col in ["tag", "bool_col", "int32_col", "int64_col", - "float_col", "double_col", "str_col", "date_col"]: + for col in [ + "tag", + "bool_col", + "int32_col", + "int64_col", + "float_col", + "double_col", + "str_col", + "date_col", + ]: assert col in df.columns, f"Column '{col}' missing from result" assert list(df["int32_col"]) == list(range(n)) @@ -252,6 +306,7 @@ def test_write_arrow_all_datatypes(): # Parity with write_dataframe # --------------------------------------------------------------------------- + def test_write_arrow_parity_with_dataframe(): """Data written via write_arrow_batch must match data written via write_dataframe.""" arrow_path = "test_write_arrow_parity_arrow.tsfile" @@ -259,45 +314,55 @@ def test_write_arrow_parity_with_dataframe(): table_name = "t" n = 500 - schema_arrow = TableSchema(table_name, [ - ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("v_i32", TSDataType.INT32, ColumnCategory.FIELD), - ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), - ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), - ]) - schema_df = TableSchema(table_name, [ - ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("v_i32", TSDataType.INT32, ColumnCategory.FIELD), - ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), - ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), - ]) + schema_arrow = TableSchema( + table_name, + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("v_i32", TSDataType.INT32, ColumnCategory.FIELD), + ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), + ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), + ], + ) + schema_df = TableSchema( + table_name, + [ + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("v_i32", TSDataType.INT32, ColumnCategory.FIELD), + ColumnSchema("v_f64", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("v_bool", TSDataType.BOOLEAN, ColumnCategory.FIELD), + ColumnSchema("v_str", TSDataType.STRING, ColumnCategory.FIELD), + ], + ) timestamps = np.arange(n, dtype="int64") - v_i32 = np.arange(n, dtype="int32") - v_f64 = np.arange(n, dtype="float64") * 2.5 + v_i32 = np.arange(n, dtype="int32") + v_f64 = np.arange(n, dtype="float64") * 2.5 v_bool = np.array([i % 3 == 0 for i in range(n)]) - v_str = [f"row{i}" for i in range(n)] + v_str = [f"row{i}" for i in range(n)] device = ["dev"] * n - batch = pa.record_batch({ - "time": pa.array(timestamps, type=pa.timestamp("ns")), - "device": pa.array(device, type=pa.string()), - "v_i32": pa.array(v_i32, type=pa.int32()), - "v_f64": pa.array(v_f64, type=pa.float64()), - "v_bool": pa.array(v_bool, type=pa.bool_()), - "v_str": pa.array(v_str, type=pa.string()), - }) - - dataframe = pd.DataFrame({ - "time": pd.Series(timestamps, dtype="int64"), - "device": device, - "v_i32": pd.Series(v_i32, dtype="int32"), - "v_f64": pd.Series(v_f64, dtype="float64"), - "v_bool": pd.Series(v_bool, dtype="bool"), - "v_str": v_str, - }) + batch = pa.record_batch( + { + "time": pa.array(timestamps, type=pa.timestamp("ns")), + "device": pa.array(device, type=pa.string()), + "v_i32": pa.array(v_i32, type=pa.int32()), + "v_f64": pa.array(v_f64, type=pa.float64()), + "v_bool": pa.array(v_bool, type=pa.bool_()), + "v_str": pa.array(v_str, type=pa.string()), + } + ) + + dataframe = pd.DataFrame( + { + "time": pd.Series(timestamps, dtype="int64"), + "device": device, + "v_i32": pd.Series(v_i32, dtype="int32"), + "v_f64": pd.Series(v_f64, dtype="float64"), + "v_bool": pd.Series(v_bool, dtype="bool"), + "v_str": v_str, + } + ) cols = ["device", "v_i32", "v_f64", "v_bool", "v_str"] @@ -312,18 +377,20 @@ def test_write_arrow_parity_with_dataframe(): w.write_dataframe(dataframe) result_arrow = _read_all_arrow(arrow_path, table_name, cols).to_pandas() - result_df = _read_all_arrow(df_path, table_name, cols).to_pandas() + result_df = _read_all_arrow(df_path, table_name, cols).to_pandas() result_arrow = result_arrow.sort_values("time").reset_index(drop=True) - result_df = result_df.sort_values("time").reset_index(drop=True) + result_df = result_df.sort_values("time").reset_index(drop=True) assert len(result_arrow) == len(result_df) == n - assert list(result_arrow["v_i32"]) == list(result_df["v_i32"]) - assert list(result_arrow["v_str"]) == list(result_df["v_str"]) + assert list(result_arrow["v_i32"]) == list(result_df["v_i32"]) + assert list(result_arrow["v_str"]) == list(result_df["v_str"]) assert list(result_arrow["v_bool"]) == list(result_df["v_bool"]) for i in range(n): - assert abs(result_arrow["v_f64"].iloc[i] - result_df["v_f64"].iloc[i]) < 1e-9 + assert ( + abs(result_arrow["v_f64"].iloc[i] - result_df["v_f64"].iloc[i]) < 1e-9 + ) finally: for p in (arrow_path, df_path): if os.path.exists(p): @@ -334,21 +401,27 @@ def test_write_arrow_parity_with_dataframe(): # Large batch # --------------------------------------------------------------------------- + def test_write_arrow_large_batch(): """Write a single large batch (100 k rows) and verify row count.""" path = "test_write_arrow_large.tsfile" table_name = "t" n = 100_000 - schema = _make_schema(table_name, [ - ColumnSchema("v", TSDataType.DOUBLE, ColumnCategory.FIELD), - ]) + schema = _make_schema( + table_name, + [ + ColumnSchema("v", TSDataType.DOUBLE, ColumnCategory.FIELD), + ], + ) - batch = pa.record_batch({ - "time": pa.array(np.arange(n, dtype="int64"), type=pa.timestamp("ns")), - "device": pa.array(["d"] * n, type=pa.string()), - "v": pa.array(np.random.rand(n), type=pa.float64()), - }) + batch = pa.record_batch( + { + "time": pa.array(np.arange(n, dtype="int64"), type=pa.timestamp("ns")), + "device": pa.array(["d"] * n, type=pa.string()), + "v": pa.array(np.random.rand(n), type=pa.float64()), + } + ) try: if os.path.exists(path): diff --git a/python/tsfile/__init__.py b/python/tsfile/__init__.py index 31390d9c2..eb1840f83 100644 --- a/python/tsfile/__init__.py +++ b/python/tsfile/__init__.py @@ -37,8 +37,19 @@ from .date_utils import * from .exceptions import * from .tsfile_reader import TsFileReaderPy as TsFileReader, ResultSetPy as ResultSet -from .tag_filter import (TagFilter, tag_eq, tag_neq, tag_lt, tag_lteq, tag_gt, tag_gteq, - tag_regexp, tag_not_regexp, tag_between, tag_not_between) +from .tag_filter import ( + TagFilter, + tag_eq, + tag_neq, + tag_lt, + tag_lteq, + tag_gt, + tag_gteq, + tag_regexp, + tag_not_regexp, + tag_between, + tag_not_between, +) from .tsfile_writer import TsFileWriterPy as TsFileWriter from .tsfile_py_cpp import get_tsfile_config, set_tsfile_config from .tsfile_table_writer import TsFileTableWriter diff --git a/python/tsfile/constants.py b/python/tsfile/constants.py index 18da3aef7..917800f8c 100644 --- a/python/tsfile/constants.py +++ b/python/tsfile/constants.py @@ -21,6 +21,7 @@ TIME_COLUMN = "time" + @unique class TSDataType(IntEnum): BOOLEAN = 0 @@ -34,7 +35,7 @@ class TSDataType(IntEnum): BLOB = 10 STRING = 11 - def is_compatible_with(self, other: 'TSDataType') -> bool: + def is_compatible_with(self, other: "TSDataType") -> bool: if self == other: return True return other in _TSDATATYPE_COMPATIBLE_SOURCES.get(self, ()) @@ -101,12 +102,13 @@ def from_pandas_datatype(cls, dtype): try: import pandas as pd - if hasattr(pd, 'StringDtype') and isinstance(dtype, pd.StringDtype): + + if hasattr(pd, "StringDtype") and isinstance(dtype, pd.StringDtype): return cls.STRING except (ImportError, AttributeError): pass - if hasattr(dtype, 'type'): + if hasattr(dtype, "type"): dtype = dtype.type if dtype is np.bool_: return cls.BOOLEAN @@ -123,21 +125,21 @@ def from_pandas_datatype(cls, dtype): dtype_str = str(dtype) - if 'stringdtype' in dtype_str.lower() or dtype_str.startswith('string'): + if "stringdtype" in dtype_str.lower() or dtype_str.startswith("string"): return cls.STRING dtype_map = { - 'bool': cls.BOOLEAN, - 'boolean': cls.BOOLEAN, - 'int32': cls.INT32, - 'Int32': cls.INT32, - 'int64': cls.INT64, - 'Int64': cls.INT64, - 'float32': cls.FLOAT, - 'float64': cls.DOUBLE, - 'bytes': cls.BLOB, - 'object': cls.STRING, - 'string': cls.STRING, + "bool": cls.BOOLEAN, + "boolean": cls.BOOLEAN, + "int32": cls.INT32, + "Int32": cls.INT32, + "int64": cls.INT64, + "Int64": cls.INT64, + "float32": cls.FLOAT, + "float64": cls.DOUBLE, + "bytes": cls.BLOB, + "object": cls.STRING, + "string": cls.STRING, } if dtype_str in dtype_map: @@ -147,10 +149,10 @@ def from_pandas_datatype(cls, dtype): if dtype_lower in dtype_map: return dtype_map[dtype_lower] - if 'object_' in dtype_lower or dtype_str == "": + if "object_" in dtype_lower or dtype_str == "": return cls.STRING - if dtype_str.startswith('datetime64'): + if dtype_str.startswith("datetime64"): return cls.TIMESTAMP return cls.STRING @@ -161,7 +163,7 @@ def from_pandas_datatype(cls, dtype): TSDataType.STRING: (TSDataType.TEXT,), TSDataType.TEXT: (TSDataType.STRING,), TSDataType.DOUBLE: (TSDataType.FLOAT,), - TSDataType.TIMESTAMP: (TSDataType.INT64, TSDataType.INT32) + TSDataType.TIMESTAMP: (TSDataType.INT64, TSDataType.INT32), } diff --git a/python/tsfile/dataset/dataframe.py b/python/tsfile/dataset/dataframe.py index e28a0e5ed..17127cf69 100644 --- a/python/tsfile/dataset/dataframe.py +++ b/python/tsfile/dataset/dataframe.py @@ -28,7 +28,12 @@ import numpy as np from .formatting import format_dataframe_table -from .metadata import TableEntry, _coerce_path_component, build_logical_series_path, split_logical_series_path +from .metadata import ( + TableEntry, + _coerce_path_component, + build_logical_series_path, + split_logical_series_path, +) from .merge import build_aligned_matrix, merge_time_value_parts, merge_timestamp_parts from .timeseries import AlignedTimeseries, Timeseries @@ -104,7 +109,9 @@ def _series_lookup_hint(name: str) -> str: return f"Series not found: '{name}'. Use df.list_timeseries() to inspect available series." -def _validate_table_schema(existing: TableEntry, incoming: TableEntry, file_path: str) -> None: +def _validate_table_schema( + existing: TableEntry, incoming: TableEntry, file_path: str +) -> None: """Reject same-name tables whose tag/field layout differs across shards.""" if ( existing.tag_columns == incoming.tag_columns @@ -180,7 +187,9 @@ def _merge_field_timestamps(series_name: str, refs: List[SeriesRef]) -> np.ndarr # [Temporary] It will be replaced by query_by_row interface in TsFile time_parts = [] for reader, device_id, field_idx in refs: - ts_arr, _ = reader.read_series_by_ref(device_id, field_idx, _QUERY_START, _QUERY_END) + ts_arr, _ = reader.read_series_by_ref( + device_id, field_idx, _QUERY_START, _QUERY_END + ) if len(ts_arr) > 0: time_parts.append(ts_arr) @@ -237,7 +246,9 @@ def __init__(self, dataframe: "TsFileDataFrame"): def _parse_key(self, key): if not isinstance(key, tuple) or len(key) != 2: - raise ValueError("loc requires exactly 2 arguments: tsdf.loc[start_time:end_time, series_list]") + raise ValueError( + "loc requires exactly 2 arguments: tsdf.loc[start_time:end_time, series_list]" + ) time_slice, series_spec = key if isinstance(time_slice, slice): @@ -264,27 +275,48 @@ def _parse_key(self, key): elif isinstance(item, str): series_ref = self._df._resolve_series_name(item) else: - raise TypeError(f"Series specifier must be int or str, got {type(item)}") + raise TypeError( + f"Series specifier must be int or str, got {type(item)}" + ) series_refs.append(series_ref) series_names.append(self._df._build_series_name(series_ref)) return start_time, end_time, series_refs, series_names - def _query_aligned(self, start_time: int, end_time: int, series_refs: List[SeriesRefKey], series_names: List[str]): + def _query_aligned( + self, + start_time: int, + end_time: int, + series_refs: List[SeriesRefKey], + series_names: List[str], + ): """Batch aligned reads by reader/device, then merge per-series fragments.""" self._df._assert_open() groups = defaultdict(list) for col_idx, series_ref in enumerate(series_refs): device_idx, field_idx = series_ref device_info = self._df._cache.devices[device_idx] - if device_info["max_time"] is None or device_info["max_time"] < start_time or device_info["min_time"] > end_time: + if ( + device_info["max_time"] is None + or device_info["max_time"] < start_time + or device_info["min_time"] > end_time + ): continue _, table_entry, _ = self._df._get_series_components(series_ref) field_name = table_entry.field_columns[field_idx] - for reader, device_id, reader_field_idx in self._df._index.series_ref_map[series_ref]: + for reader, device_id, reader_field_idx in self._df._index.series_ref_map[ + series_ref + ]: groups[(id(reader), device_id)].append( - (col_idx, reader_field_idx, field_name, series_names[col_idx], reader, device_id) + ( + col_idx, + reader_field_idx, + field_name, + series_names[col_idx], + reader, + device_id, + ) ) series_time_parts = defaultdict(list) @@ -293,7 +325,9 @@ def _query_aligned(self, start_time: int, end_time: int, series_refs: List[Serie reader = entries[0][4] device_id = entries[0][5] field_indices = list(dict.fromkeys(entry[1] for entry in entries)) - ts_arr, field_vals = reader.read_device_fields_by_time_range(device_id, field_indices, start_time, end_time) + ts_arr, field_vals = reader.read_device_fields_by_time_range( + device_id, field_indices, start_time, end_time + ) for _, _, field_name, series_name, _, _ in entries: if len(ts_arr) > 0: series_time_parts[series_name].append(ts_arr) @@ -301,13 +335,17 @@ def _query_aligned(self, start_time: int, end_time: int, series_refs: List[Serie series_data = {} for name in series_names: - series_data[name] = merge_time_value_parts(series_time_parts[name], series_value_parts[name]) + series_data[name] = merge_time_value_parts( + series_time_parts[name], series_value_parts[name] + ) return build_aligned_matrix(series_names, series_data) def __getitem__(self, key) -> AlignedTimeseries: start_time, end_time, series_refs, series_names = self._parse_key(key) - timestamps, values = self._query_aligned(start_time, end_time, series_refs, series_names) + timestamps, values = self._query_aligned( + start_time, end_time, series_refs, series_names + ) return AlignedTimeseries(timestamps, values, series_names) @@ -326,7 +364,9 @@ def __init__(self, paths: Union[str, List[str]], show_progress: bool = True): self._load_metadata() @classmethod - def _from_subset(cls, parent: "TsFileDataFrame", series_refs: List[SeriesRefKey]) -> "TsFileDataFrame": + def _from_subset( + cls, parent: "TsFileDataFrame", series_refs: List[SeriesRefKey] + ) -> "TsFileDataFrame": """Create a lightweight view that reuses the parent's readers and caches.""" obj = object.__new__(cls) obj._root = parent._root if parent._is_view else parent @@ -343,7 +383,9 @@ def _from_subset(cls, parent: "TsFileDataFrame", series_refs: List[SeriesRefKey] series_ref_map=parent._index.series_ref_map, series_ref_set=set(series_refs), ) - obj._cache = _DerivedCache(devices=parent._cache.devices, field_stats=parent._cache.field_stats) + obj._cache = _DerivedCache( + devices=parent._cache.devices, field_stats=parent._cache.field_stats + ) obj._closed = False return obj @@ -363,9 +405,13 @@ def _load_metadata(self): else: self._load_metadata_serial(TsFileSeriesReader) - self._cache.devices = [_build_device_entry(refs) for refs in self._index.device_refs] + self._cache.devices = [ + _build_device_entry(refs) for refs in self._index.device_refs + ] for series_ref in self._index.series_refs_ordered: - self._cache.field_stats[series_ref] = _build_field_stats(self._index.series_ref_map[series_ref]) + self._cache.field_stats[series_ref] = _build_field_stats( + self._index.series_ref_map[series_ref] + ) self._index.series_ref_set = set(self._index.series_refs_ordered) if not self._index.series_refs_ordered: @@ -387,7 +433,9 @@ def open_file(file_path): return file_path, reader_class(file_path, show_progress=False) total = len(self._paths) - with ThreadPoolExecutor(max_workers=min(total, os.cpu_count() or 4)) as executor: + with ThreadPoolExecutor( + max_workers=min(total, os.cpu_count() or 4) + ) as executor: futures = {executor.submit(open_file, path): path for path in self._paths} results = {} done = 0 @@ -401,7 +449,9 @@ def open_file(file_path): if self._show_progress and total > 0: total_series = sum(reader.series_count for reader in results.values()) - sys.stderr.write(f"\rLoading TsFile shards: {total}/{total} ({total_series} series) ... done\n") + sys.stderr.write( + f"\rLoading TsFile shards: {total}/{total} ({total_series} series) ... done\n" + ) sys.stderr.flush() for file_path in self._paths: @@ -412,7 +462,9 @@ def open_file(file_path): results[file_path], ) - def _get_series_components(self, series_ref: SeriesRefKey) -> Tuple[DeviceKey, TableEntry, int]: + def _get_series_components( + self, series_ref: SeriesRefKey + ) -> Tuple[DeviceKey, TableEntry, int]: device_idx, field_idx = series_ref device_key = self._index.device_order[device_idx] return device_key, self._index.table_entries[device_key[0]], field_idx @@ -478,11 +530,16 @@ def __len__(self) -> int: return len(self._index.series_refs_ordered) def list_timeseries(self, path_prefix: str = "") -> List[str]: - names = [self._build_series_name(series_ref) for series_ref in self._index.series_refs_ordered] + names = [ + self._build_series_name(series_ref) + for series_ref in self._index.series_refs_ordered + ] if not path_prefix: return names prefix = path_prefix if path_prefix.endswith(".") else path_prefix + "." - return [name for name in names if name.startswith(prefix) or name == path_prefix] + return [ + name for name in names if name.startswith(prefix) or name == path_prefix + ] def _get_timeseries(self, series_ref: SeriesRefKey) -> Timeseries: self._assert_open() @@ -492,7 +549,9 @@ def _get_timeseries(self, series_ref: SeriesRefKey) -> Timeseries: self._index.series_ref_map[series_ref], self._cache.field_stats[series_ref], self._assert_open, - lambda: _merge_field_timestamps(series_name, self._index.series_ref_map[series_ref]), + lambda: _merge_field_timestamps( + series_name, self._index.series_ref_map[series_ref] + ), ) def __getitem__(self, key): @@ -500,7 +559,9 @@ def __getitem__(self, key): import pandas as pd if isinstance(key, pd.Series) and key.dtype == bool: - selected = [self._index.series_refs_ordered[idx] for idx in key.index[key]] + selected = [ + self._index.series_refs_ordered[idx] for idx in key.index[key] + ] return TsFileDataFrame._from_subset(self, selected) except ImportError: pass @@ -510,7 +571,9 @@ def __getitem__(self, key): if idx < 0: idx += len(self._index.series_refs_ordered) if idx < 0 or idx >= len(self._index.series_refs_ordered): - raise IndexError(f"Index {idx} out of range [0, {len(self._index.series_refs_ordered)})") + raise IndexError( + f"Index {idx} out of range [0, {len(self._index.series_refs_ordered)})" + ) return self._get_timeseries(self._index.series_refs_ordered[idx]) if isinstance(key, str): @@ -546,19 +609,26 @@ def __getitem__(self, key): if isinstance(key, slice): return TsFileDataFrame._from_subset( self, - [self._index.series_refs_ordered[idx] for idx in range(*key.indices(len(self._index.series_refs_ordered)))], + [ + self._index.series_refs_ordered[idx] + for idx in range(*key.indices(len(self._index.series_refs_ordered))) + ], ) if isinstance(key, list): selected = [] for item in key: if not isinstance(item, (int, np.integer)): - raise TypeError(f"List index must contain integers, got {type(item)}") + raise TypeError( + f"List index must contain integers, got {type(item)}" + ) idx = int(item) if idx < 0: idx += len(self._index.series_refs_ordered) if idx < 0 or idx >= len(self._index.series_refs_ordered): - raise IndexError(f"Index {item} out of range [0, {len(self._index.series_refs_ordered)})") + raise IndexError( + f"Index {item} out of range [0, {len(self._index.series_refs_ordered)})" + ) selected.append(self._index.series_refs_ordered[idx]) return TsFileDataFrame._from_subset(self, selected) diff --git a/python/tsfile/dataset/formatting.py b/python/tsfile/dataset/formatting.py index 5e01bb39b..f484032c8 100644 --- a/python/tsfile/dataset/formatting.py +++ b/python/tsfile/dataset/formatting.py @@ -67,7 +67,9 @@ def format_aligned_timeseries( col_widths = [] rendered_values = [] # list of dicts: row_idx -> cell string for col_idx in range(n_cols): - col_name = series_names[col_idx] if col_idx < len(series_names) else f"col_{col_idx}" + col_name = ( + series_names[col_idx] if col_idx < len(series_names) else f"col_{col_idx}" + ) width = len(col_name) column = {} for row_idx in show_indices: @@ -80,7 +82,9 @@ def format_aligned_timeseries( header = ["time".rjust(ts_width)] for col_idx, width in enumerate(col_widths): - col_name = series_names[col_idx] if col_idx < len(series_names) else f"col_{col_idx}" + col_name = ( + series_names[col_idx] if col_idx < len(series_names) else f"col_{col_idx}" + ) header.append(col_name.rjust(width)) lines = [" ".join(header)] @@ -155,7 +159,10 @@ def format_dataframe_table( for row_idx, row in enumerate(rows): if truncated and row_idx == split: lines.append("...") - parts = [str(row["index"]).rjust(widths[""]), row["table"].rjust(widths["table"])] + parts = [ + str(row["index"]).rjust(widths[""]), + row["table"].rjust(widths["table"]), + ] for tag_col in tag_columns: parts.append(str(row[tag_col]).rjust(widths[tag_col])) parts.extend( diff --git a/python/tsfile/dataset/merge.py b/python/tsfile/dataset/merge.py index 1f72290ea..8d70dc552 100644 --- a/python/tsfile/dataset/merge.py +++ b/python/tsfile/dataset/merge.py @@ -44,7 +44,9 @@ def merge_timestamp_parts( return parts[0] parts.sort(key=lambda ts_part: int(ts_part[0])) - if all(int(parts[idx - 1][-1]) < int(parts[idx][0]) for idx in range(1, len(parts))): + if all( + int(parts[idx - 1][-1]) < int(parts[idx][0]) for idx in range(1, len(parts)) + ): return np.concatenate(parts) total_length = sum(len(ts_part) for ts_part in parts) @@ -71,7 +73,9 @@ def merge_timestamp_parts( next_offset = offset + 1 if next_offset < len(parts[part_idx]): - heapq.heappush(heap, (int(parts[part_idx][next_offset]), part_idx, next_offset)) + heapq.heappush( + heap, (int(parts[part_idx][next_offset]), part_idx, next_offset) + ) if validate_unique: return merged[:out_idx] @@ -93,7 +97,11 @@ def merge_time_value_parts( order after sorting parts by their first timestamp. Fallback: use a k-way merge for overlapping-but-disjoint ranges. """ - parts = [(ts_part, val_part) for ts_part, val_part in zip(time_parts, value_parts) if len(ts_part) > 0] + parts = [ + (ts_part, val_part) + for ts_part, val_part in zip(time_parts, value_parts) + if len(ts_part) > 0 + ] if not parts: return np.array([], dtype=np.int64), np.array([], dtype=np.float64) if len(parts) == 1: @@ -103,14 +111,19 @@ def merge_time_value_parts( time_parts = [ts_part for ts_part, _ in parts] value_parts = [val_part for _, val_part in parts] - if all(int(time_parts[idx - 1][-1]) < int(time_parts[idx][0]) for idx in range(1, len(time_parts))): + if all( + int(time_parts[idx - 1][-1]) < int(time_parts[idx][0]) + for idx in range(1, len(time_parts)) + ): return np.concatenate(time_parts), np.concatenate(value_parts) total_length = sum(len(ts_part) for ts_part in time_parts) merged_ts = np.empty(total_length, dtype=np.int64) merged_vals = np.empty(total_length, dtype=np.float64) - heap = [(int(ts_part[0]), part_idx, 0) for part_idx, ts_part in enumerate(time_parts)] + heap = [ + (int(ts_part[0]), part_idx, 0) for part_idx, ts_part in enumerate(time_parts) + ] heapq.heapify(heap) out_idx = 0 @@ -122,7 +135,9 @@ def merge_time_value_parts( next_offset = offset + 1 if next_offset < len(time_parts[part_idx]): - heapq.heappush(heap, (int(time_parts[part_idx][next_offset]), part_idx, next_offset)) + heapq.heappush( + heap, (int(time_parts[part_idx][next_offset]), part_idx, next_offset) + ) return merged_ts, merged_vals diff --git a/python/tsfile/dataset/metadata.py b/python/tsfile/dataset/metadata.py index 5c4611e06..1616cfc53 100644 --- a/python/tsfile/dataset/metadata.py +++ b/python/tsfile/dataset/metadata.py @@ -41,11 +41,15 @@ class TableEntry: _field_index_by_name: Dict[str, int] = field(init=False, repr=False) def __post_init__(self): - self._field_index_by_name = {column: idx for idx, column in enumerate(self.field_columns)} + self._field_index_by_name = { + column: idx for idx, column in enumerate(self.field_columns) + } def get_field_index(self, field_name: str) -> int: if field_name not in self._field_index_by_name: - raise ValueError(f"Field not found in table '{self.table_name}': {field_name}") + raise ValueError( + f"Field not found in table '{self.table_name}': {field_name}" + ) return self._field_index_by_name[field_name] @@ -92,7 +96,9 @@ def add_table( self.table_id_by_name[table_name] = table_id return table_id - def add_device(self, table_id: int, tag_values: tuple, timestamps: np.ndarray) -> int: + def add_device( + self, table_id: int, tag_values: tuple, timestamps: np.ndarray + ) -> int: key = (table_id, tuple(tag_values)) if key in self.device_id_by_key: return self.device_id_by_key[key] @@ -117,11 +123,18 @@ def add_device(self, table_id: int, tag_values: tuple, timestamps: np.ndarray) - @property def series_count(self) -> int: - return sum(len(self.table_entries[device.table_id].field_columns) for device in self.device_entries) + return sum( + len(self.table_entries[device.table_id].field_columns) + for device in self.device_entries + ) def _escape_path_component(value: Any) -> str: - return str(value).replace(_PATH_ESCAPE, _PATH_ESCAPE * 2).replace(_PATH_SEPARATOR, _PATH_ESCAPE + _PATH_SEPARATOR) + return ( + str(value) + .replace(_PATH_ESCAPE, _PATH_ESCAPE * 2) + .replace(_PATH_SEPARATOR, _PATH_ESCAPE + _PATH_SEPARATOR) + ) def split_logical_series_path(series_path: str) -> List[str]: @@ -150,9 +163,13 @@ def split_logical_series_path(series_path: str) -> List[str]: return parts -def build_logical_series_path(table_name: str, tag_values: Iterable[Any], field_name: str) -> str: +def build_logical_series_path( + table_name: str, tag_values: Iterable[Any], field_name: str +) -> str: components = [table_name, *tag_values, field_name] - return _PATH_SEPARATOR.join(_escape_path_component(component) for component in components) + return _PATH_SEPARATOR.join( + _escape_path_component(component) for component in components + ) def build_series_path(catalog: MetadataCatalog, device_id: int, field_idx: int) -> str: @@ -160,7 +177,9 @@ def build_series_path(catalog: MetadataCatalog, device_id: int, field_idx: int) device_entry = catalog.device_entries[device_id] table_entry = catalog.table_entries[device_entry.table_id] field_name = table_entry.field_columns[field_idx] - return build_logical_series_path(table_entry.table_name, device_entry.tag_values, field_name) + return build_logical_series_path( + table_entry.table_name, device_entry.tag_values, field_name + ) def iter_series_refs(catalog: MetadataCatalog) -> Iterator[Tuple[int, int]]: @@ -177,7 +196,9 @@ def iter_series_paths(catalog: MetadataCatalog) -> Iterator[str]: yield build_series_path(catalog, device_id, field_idx) -def resolve_series_path(catalog: MetadataCatalog, series_path: str) -> Tuple[int, int, int]: +def resolve_series_path( + catalog: MetadataCatalog, series_path: str +) -> Tuple[int, int, int]: """Resolve an external path to ``(table_id, device_id, field_idx)``.""" parts = split_logical_series_path(series_path) if len(parts) < 2: @@ -220,7 +241,12 @@ def _coerce_path_component(value: str, data_type: TSDataType) -> Any: if lowered == "false": return False raise ValueError(f"Invalid boolean tag value: {value}") - if data_type in {TSDataType.INT32, TSDataType.INT64, TSDataType.TIMESTAMP, TSDataType.DATE}: + if data_type in { + TSDataType.INT32, + TSDataType.INT64, + TSDataType.TIMESTAMP, + TSDataType.DATE, + }: return int(value) if data_type in {TSDataType.FLOAT, TSDataType.DOUBLE}: return float(value) diff --git a/python/tsfile/dataset/reader.py b/python/tsfile/dataset/reader.py index d73580237..a09cfc2da 100644 --- a/python/tsfile/dataset/reader.py +++ b/python/tsfile/dataset/reader.py @@ -27,7 +27,12 @@ from ..constants import ColumnCategory, TSDataType from ..tsfile_reader import TsFileReaderPy -from .metadata import MetadataCatalog, build_series_path, iter_series_refs, resolve_series_path +from .metadata import ( + MetadataCatalog, + build_series_path, + iter_series_refs, + resolve_series_path, +) _NUMERIC_FIELD_TYPES = { @@ -83,7 +88,9 @@ def iter_series_paths(self) -> Iterator[str]: def iter_series_refs(self) -> Iterator[Tuple[str, int, int]]: for device_id, field_idx in iter_series_refs(self._catalog): - yield build_series_path(self._catalog, device_id, field_idx), device_id, field_idx + yield build_series_path( + self._catalog, device_id, field_idx + ), device_id, field_idx def close(self): if hasattr(self, "_reader"): @@ -137,14 +144,18 @@ def _cache_metadata_table_model(self): if not field_columns: continue - table_id = self._catalog.add_table(table_name, tag_columns, tag_types, field_columns) + table_id = self._catalog.add_table( + table_name, tag_columns, tag_types, field_columns + ) time_arrays = [] tag_arrays = {tag_column: [] for tag_column in tag_columns} # [Temporary] It will be replaced by new tsfile api, we won't query all the data later. query_columns = tag_columns + field_columns - with self._reader.query_table(table_name, query_columns, batch_size=65536) as result_set: + with self._reader.query_table( + table_name, query_columns, batch_size=65536 + ) as result_set: while True: arrow_table = result_set.read_arrow_batch() if arrow_table is None: @@ -153,7 +164,9 @@ def _cache_metadata_table_model(self): total_rows += batch_rows time_arrays.append(arrow_table.column("time").to_numpy()) for tag_column in tag_columns: - tag_arrays[tag_column].append(arrow_table.column(tag_column).to_numpy()) + tag_arrays[tag_column].append( + arrow_table.column(tag_column).to_numpy() + ) if self.show_progress: sys.stderr.write( @@ -170,7 +183,9 @@ def _cache_metadata_table_model(self): self._add_device(table_id, (), timestamps) continue - for tag_values, device_timestamps in self._iter_device_groups(tag_columns, timestamps, tag_arrays): + for tag_values, device_timestamps in self._iter_device_groups( + tag_columns, timestamps, tag_arrays + ): self._add_device(table_id, tag_values, device_timestamps) if self.show_progress and total_rows > 0: @@ -190,7 +205,9 @@ def _iter_device_groups( tag_arrays: Dict[str, list], ) -> Iterator[Tuple[tuple, np.ndarray]]: """Group one table's rows by tag tuple while preserving original row membership.""" - tag_values_by_column = {column: np.concatenate(tag_arrays[column]) for column in tag_columns} + tag_values_by_column = { + column: np.concatenate(tag_arrays[column]) for column in tag_columns + } n = len(timestamps) arrays = [tag_values_by_column[col] for col in tag_columns] @@ -199,12 +216,16 @@ def _iter_device_groups( for i, col in enumerate(tag_columns): composite[col] = arrays[i] - _, inverse, counts = np.unique(composite, return_inverse=True, return_counts=True) + _, inverse, counts = np.unique( + composite, return_inverse=True, return_counts=True + ) ordered_indices = np.argsort(inverse, kind="stable") group_bounds = np.cumsum(counts)[:-1] for group_indices in np.split(ordered_indices, group_bounds): first = int(group_indices[0]) - tag_tuple = tuple(_to_python_scalar(composite[col][first]) for col in tag_columns) + tag_tuple = tuple( + _to_python_scalar(composite[col][first]) for col in tag_columns + ) yield tag_tuple, timestamps[group_indices] def _add_device( @@ -245,7 +266,9 @@ def get_device_timestamps(self, device_id: int) -> np.ndarray: return self._catalog.device_entries[device_id].timestamps def get_series_info_by_ref(self, device_id: int, field_idx: int) -> dict: - table_entry, device_entry, field_name = self._resolve_series_ref(device_id, field_idx) + table_entry, device_entry, field_name = self._resolve_series_ref( + device_id, field_idx + ) return { "length": device_entry.length, "min_time": device_entry.min_time, @@ -266,14 +289,20 @@ def get_series_timestamps(self, series_path: str) -> np.ndarray: device_id = self._resolve_series_path(series_path)[1] return self.get_device_timestamps(device_id) - def read_series_by_ref(self, device_id: int, field_idx: int, start_time: int, end_time: int) -> Tuple[np.ndarray, np.ndarray]: + def read_series_by_ref( + self, device_id: int, field_idx: int, start_time: int, end_time: int + ) -> Tuple[np.ndarray, np.ndarray]: table_entry, _, field_name = self._resolve_series_ref(device_id, field_idx) - timestamps, field_values = self.read_device_fields_by_time_range(device_id, [field_idx], start_time, end_time) + timestamps, field_values = self.read_device_fields_by_time_range( + device_id, [field_idx], start_time, end_time + ) if len(timestamps) == 0: return np.array([], dtype=np.int64), np.array([], dtype=np.float64) return timestamps, field_values[field_name] - def read_series_by_time_range(self, series_path: str, start_time: int, end_time: int) -> Tuple[np.ndarray, np.ndarray]: + def read_series_by_time_range( + self, series_path: str, start_time: int, end_time: int + ) -> Tuple[np.ndarray, np.ndarray]: _, device_id, field_idx = self._resolve_series_path(series_path) return self.read_series_by_ref(device_id, field_idx, start_time, end_time) @@ -283,7 +312,9 @@ def read_device_fields_by_time_range( """Read one device slice and return the requested field columns keyed by field name.""" device_entry = self._catalog.device_entries[device_id] table_entry = self._catalog.table_entries[device_entry.table_id] - requested_field_columns = [table_entry.field_columns[field_idx] for field_idx in field_indices] + requested_field_columns = [ + table_entry.field_columns[field_idx] for field_idx in field_indices + ] timestamps, field_values = self._read_arrow( table_entry.table_name, requested_field_columns, @@ -306,7 +337,9 @@ def _read_arrow( """Execute the underlying table query, then apply tag filtering client-side.""" tag_columns = list(tag_columns) field_columns = list(field_columns) - query_columns = tag_columns + field_columns if tag_columns else list(field_columns) + query_columns = ( + tag_columns + field_columns if tag_columns else list(field_columns) + ) timestamp_parts = [] field_parts = {field_column: [] for field_column in field_columns} @@ -325,8 +358,12 @@ def _read_arrow( if tag_values: mask = None for tag_column, tag_value in tag_values.items(): - column_mask = pc.equal(arrow_table.column(tag_column), tag_value) - mask = column_mask if mask is None else pc.and_(mask, column_mask) + column_mask = pc.equal( + arrow_table.column(tag_column), tag_value + ) + mask = ( + column_mask if mask is None else pc.and_(mask, column_mask) + ) arrow_table = arrow_table.filter(mask) if arrow_table.num_rows == 0: @@ -336,7 +373,9 @@ def _read_arrow( for field_column in field_columns: raw_values = arrow_table.column(field_column).to_numpy() try: - field_parts[field_column].append(np.asarray(raw_values, dtype=np.float64)) + field_parts[field_column].append( + np.asarray(raw_values, dtype=np.float64) + ) except (TypeError, ValueError) as e: raise TypeError( f"Field column '{field_column}' in table '{table_name}' is not numeric-compatible." @@ -345,10 +384,16 @@ def _read_arrow( if not timestamp_parts: return ( np.array([], dtype=np.int64), - {field_column: np.array([], dtype=np.float64) for field_column in field_columns}, + { + field_column: np.array([], dtype=np.float64) + for field_column in field_columns + }, ) return ( np.concatenate(timestamp_parts).astype(np.int64), - {field_column: np.concatenate(field_parts[field_column]) for field_column in field_columns}, + { + field_column: np.concatenate(field_parts[field_column]) + for field_column in field_columns + }, ) diff --git a/python/tsfile/dataset/timeseries.py b/python/tsfile/dataset/timeseries.py index 35f1614d4..21f17a2ab 100644 --- a/python/tsfile/dataset/timeseries.py +++ b/python/tsfile/dataset/timeseries.py @@ -33,7 +33,9 @@ class AlignedTimeseries: the union of timestamps from the selected logical series. """ - def __init__(self, timestamps: np.ndarray, values: np.ndarray, series_names: List[str]): + def __init__( + self, timestamps: np.ndarray, values: np.ndarray, series_names: List[str] + ): self.timestamps = timestamps self.values = values self.series_names = series_names @@ -49,10 +51,16 @@ def __getitem__(self, key): return self.values[key] def __repr__(self): - return format_aligned_timeseries(self.timestamps, self.values, self.series_names, max_rows=20) + return format_aligned_timeseries( + self.timestamps, self.values, self.series_names, max_rows=20 + ) def show(self, max_rows: Optional[int] = None): - print(format_aligned_timeseries(self.timestamps, self.values, self.series_names, max_rows=max_rows)) + print( + format_aligned_timeseries( + self.timestamps, self.values, self.series_names, max_rows=max_rows + ) + ) class Timeseries: @@ -118,7 +126,9 @@ def __getitem__(self, key): if len(requested_ts) == 0: return np.array([], dtype=np.float64) - ts_arr, values = self._query_time_range(int(np.min(requested_ts)), int(np.max(requested_ts))) + ts_arr, values = self._query_time_range( + int(np.min(requested_ts)), int(np.max(requested_ts)) + ) result = np.full(len(requested_ts), np.nan) if len(ts_arr) > 0: indices = np.searchsorted(ts_arr, requested_ts) @@ -130,7 +140,9 @@ def __getitem__(self, key): raise TypeError(f"Unsupported key type: {type(key)}") - def _query_time_range(self, start_time: int, end_time: int) -> Tuple[np.ndarray, np.ndarray]: + def _query_time_range( + self, start_time: int, end_time: int + ) -> Tuple[np.ndarray, np.ndarray]: self._ensure_open() time_parts = [] value_parts = [] @@ -138,7 +150,9 @@ def _query_time_range(self, start_time: int, end_time: int) -> Tuple[np.ndarray, device_timestamps = reader.get_device_timestamps(device_id) if device_timestamps[-1] < start_time or device_timestamps[0] > end_time: continue - ts_arr, val_arr = reader.read_series_by_ref(device_id, field_idx, start_time, end_time) + ts_arr, val_arr = reader.read_series_by_ref( + device_id, field_idx, start_time, end_time + ) if len(ts_arr) > 0: time_parts.append(ts_arr) value_parts.append(val_arr) diff --git a/python/tsfile/exceptions.py b/python/tsfile/exceptions.py index a02f392ce..a2e930bad 100644 --- a/python/tsfile/exceptions.py +++ b/python/tsfile/exceptions.py @@ -16,6 +16,7 @@ # under the License. # + class LibraryError(Exception): _default_message = "Unknown error occurred" _default_code = -1 @@ -171,5 +172,7 @@ def get_exception(code: int, context: str = None): exc_type = ERROR_MAPPING.get(code) if not exc_type: - return LibraryError(code=code, context=f"Unmapped error code: {code}, message: {context}") + return LibraryError( + code=code, context=f"Unmapped error code: {code}, message: {context}" + ) return exc_type(code=code, context=context) diff --git a/python/tsfile/field.py b/python/tsfile/field.py index 4f3f07983..5d83e25af 100644 --- a/python/tsfile/field.py +++ b/python/tsfile/field.py @@ -75,13 +75,15 @@ def get_int_value(self): raise NoneDataTypeException("None Data Type Exception!") if ( - self.data_type != TSDataType.INT32 - and self.data_type != TSDataType.DATE - and self.data_type != TSDataType.INT64 - and self.data_type != TSDataType.FLOAT - and self.data_type != TSDataType.DOUBLE + self.data_type != TSDataType.INT32 + and self.data_type != TSDataType.DATE + and self.data_type != TSDataType.INT64 + and self.data_type != TSDataType.FLOAT + and self.data_type != TSDataType.DOUBLE ): - raise TypeError(f"Expected INT32/64 or DOUBLE/FLOAT data type, got {self.data_type}.") + raise TypeError( + f"Expected INT32/64 or DOUBLE/FLOAT data type, got {self.data_type}." + ) min_int32, max_int32 = np.iinfo(np.int32).min, np.iinfo(np.int32).max if self.data_type == TSDataType.DATE: return parse_date_to_int(self.value) @@ -99,12 +101,14 @@ def get_long_value(self): raise NoneDataTypeException("None Data Type Exception!") if ( - self.data_type != TSDataType.INT32 - and self.data_type != TSDataType.INT64 - and self.data_type != TSDataType.FLOAT - and self.data_type != TSDataType.DOUBLE + self.data_type != TSDataType.INT32 + and self.data_type != TSDataType.INT64 + and self.data_type != TSDataType.FLOAT + and self.data_type != TSDataType.DOUBLE ): - raise TypeError(f"Expected INT32/64 or DOUBLE/FLOAT data type, got {self.data_type}.") + raise TypeError( + f"Expected INT32/64 or DOUBLE/FLOAT data type, got {self.data_type}." + ) return np.int64(self.value) @@ -114,10 +118,12 @@ def get_timestamp_value(self): if self.data_type is None: raise NoneDataTypeException("None Data Type Exception!") if ( - self.data_type != TSDataType.TIMESTAMP - and self.data_type != TSDataType.INT64 + self.data_type != TSDataType.TIMESTAMP + and self.data_type != TSDataType.INT64 ): - raise TypeError(f"Expected INT64/TIMESTAMP data type, got {self.data_type}.") + raise TypeError( + f"Expected INT64/TIMESTAMP data type, got {self.data_type}." + ) return np.int64(self.value) def get_float_value(self): @@ -126,12 +132,14 @@ def get_float_value(self): if self.data_type is None: raise NoneDataTypeException("None Data Type Exception!") if ( - self.data_type != TSDataType.INT32 - and self.data_type != TSDataType.INT64 - and self.data_type != TSDataType.FLOAT - and self.data_type != TSDataType.DOUBLE + self.data_type != TSDataType.INT32 + and self.data_type != TSDataType.INT64 + and self.data_type != TSDataType.FLOAT + and self.data_type != TSDataType.DOUBLE ): - raise TypeError(f"Expected INT32/64 or DOUBLE/FLOAT data type, got {self.data_type}.") + raise TypeError( + f"Expected INT32/64 or DOUBLE/FLOAT data type, got {self.data_type}." + ) min_float32, max_float32 = np.finfo(np.float32).min, np.finfo(np.float32).max if not (min_float32 <= self.value <= max_float32): raise OverflowError( @@ -146,12 +154,14 @@ def get_double_value(self): if self.data_type is None: raise NoneDataTypeException("None Data Type Exception!") if ( - self.data_type != TSDataType.INT32 - and self.data_type != TSDataType.INT64 - and self.data_type != TSDataType.FLOAT - and self.data_type != TSDataType.DOUBLE + self.data_type != TSDataType.INT32 + and self.data_type != TSDataType.INT64 + and self.data_type != TSDataType.FLOAT + and self.data_type != TSDataType.DOUBLE ): - raise TypeError(f"Expected INT32/64 or DOUBLE/FLOAT data type, got {self.data_type}.") + raise TypeError( + f"Expected INT32/64 or DOUBLE/FLOAT data type, got {self.data_type}." + ) return np.float64(self.value) def get_date_value(self): @@ -159,10 +169,7 @@ def get_date_value(self): return None if self.data_type is None: raise NoneDataTypeException("None Data Type Exception!") - if ( - self.data_type != TSDataType.DATE - and self.data_type != TSDataType.INT64 - ): + if self.data_type != TSDataType.DATE and self.data_type != TSDataType.INT64: raise TypeError(f"Expected DATE/INT64 data type, got {self.data_type}.") if isinstance(self.value, datetime): return self.value @@ -221,7 +228,11 @@ def get_object_value(self, data_type: TSDataType): return self.value elif isinstance(self.value, int): return parse_int_to_date(self.value) - elif data_type == TSDataType.TEXT or data_type == TSDataType.BLOB or data_type == TSDataType.STRING: + elif ( + data_type == TSDataType.TEXT + or data_type == TSDataType.BLOB + or data_type == TSDataType.STRING + ): return self.value else: raise RuntimeError("Unsupported data type:" + str(data_type)) diff --git a/python/tsfile/row_record.py b/python/tsfile/row_record.py index ba94f1f85..c1d06bcae 100644 --- a/python/tsfile/row_record.py +++ b/python/tsfile/row_record.py @@ -18,6 +18,7 @@ from tsfile.field import Field + class RowRecord(object): def __init__(self, device_id, timestamp, field_list: list = None): self.__timestamp = timestamp diff --git a/python/tsfile/schema.py b/python/tsfile/schema.py index ce85f6839..6cfcb090d 100644 --- a/python/tsfile/schema.py +++ b/python/tsfile/schema.py @@ -129,8 +129,13 @@ class TimeseriesSchema: encoding_type = None compression_type = None - def __init__(self, timeseries_name: str, data_type: TSDataType, encoding_type: TSEncoding = TSEncoding.PLAIN, - compression_type: Compressor = Compressor.UNCOMPRESSED): + def __init__( + self, + timeseries_name: str, + data_type: TSDataType, + encoding_type: TSEncoding = TSEncoding.PLAIN, + compression_type: Compressor = Compressor.UNCOMPRESSED, + ): self.timeseries_name = timeseries_name self.data_type = data_type self.encoding_type = encoding_type @@ -178,16 +183,29 @@ class ColumnSchema: column_name = None data_type = None - def __init__(self, column_name: str, data_type: TSDataType, category: ColumnCategory = ColumnCategory.FIELD): + def __init__( + self, + column_name: str, + data_type: TSDataType, + category: ColumnCategory = ColumnCategory.FIELD, + ): if column_name is None or len(column_name) == 0: raise ValueError("Column name cannot be None") self.column_name = column_name.lower() if data_type is None: raise ValueError("Data type cannot be None") - if category == ColumnCategory.TIME and data_type not in [TSDataType.INT64, TSDataType.TIMESTAMP]: - raise TypeError(f"Time Column should have type : INT64/Timestamp," - f" but got {data_type}") - elif category == ColumnCategory.TAG and data_type not in [TSDataType.STRING, TSDataType.TEXT]: + if category == ColumnCategory.TIME and data_type not in [ + TSDataType.INT64, + TSDataType.TIMESTAMP, + ]: + raise TypeError( + f"Time Column should have type : INT64/Timestamp," + f" but got {data_type}" + ) + elif category == ColumnCategory.TAG and data_type not in [ + TSDataType.STRING, + TSDataType.TEXT, + ]: raise TypeMismatchError(context="Tag column should be string or text") self.data_type = data_type self.category = category @@ -207,6 +225,7 @@ def get_category(self): class TableSchema: """Schema definition for a table structure.""" + table_name = None columns = None time_column = None @@ -271,9 +290,7 @@ def add_column(self, column: ColumnSchema): else: for col in self.columns: if col.get_column_name() == column.get_column_name(): - raise ValueError( - f"Duplicate column name {col.get_column_name()}" - ) + raise ValueError(f"Duplicate column name {col.get_column_name()}") self.columns.append(column) def __repr__(self) -> str: @@ -282,6 +299,7 @@ def __repr__(self) -> str: class ResultSetMetaData: """Metadata container for query result sets (columns, types, table name).""" + column_list = None data_types = None table_name = None @@ -296,7 +314,9 @@ def set_table_name(self, table_name: str): def add_column_at(self, index: int, column_name: str, data_type: TSDataType): """Insert a column and its data type at the given position (0-based index).""" if index < 0 or index > len(self.column_list): - raise IndexError(f"column index {index} out of range (0 to {len(self.column_list)})") + raise IndexError( + f"column index {index} out of range (0 to {len(self.column_list)})" + ) self.column_list.insert(index, column_name) self.data_types.insert(index, data_type) diff --git a/python/tsfile/tablet.py b/python/tsfile/tablet.py index 71ecb0633..a27e1fdb8 100644 --- a/python/tsfile/tablet.py +++ b/python/tsfile/tablet.py @@ -37,30 +37,40 @@ class Tablet(object): for numeric types. """ - def __init__(self, column_name_list: list[str], type_list: list[TSDataType], - max_row_num: int = 1024): + def __init__( + self, + column_name_list: list[str], + type_list: list[TSDataType], + max_row_num: int = 1024, + ): self.timestamp_list = [None for _ in range(max_row_num)] self.data_list: List[List[Union[int, float, bool, str, bytes, date, None]]] = [ [None for _ in range(max_row_num)] for _ in range(len(column_name_list)) ] self.target_name = None - self.column_name_list = [column_name.lower() for column_name in column_name_list] + self.column_name_list = [ + column_name.lower() for column_name in column_name_list + ] self.type_list = type_list self.max_row_num = max_row_num self._type_ranges = { - TSDataType.INT32: (-2 ** 31, 2 ** 31 - 1), - TSDataType.INT64: (-2 ** 63, 2 ** 63 - 1), + TSDataType.INT32: (-(2**31), 2**31 - 1), + TSDataType.INT64: (-(2**63), 2**63 - 1), TSDataType.FLOAT: (np.finfo(np.float32).min, np.finfo(np.float32).max), TSDataType.DOUBLE: (np.finfo(np.float64).min, np.finfo(np.float64).max), } def _check_index(self, col_index: int, row_index: int): if not (0 <= col_index < len(self.column_name_list)): - raise IndexError(f"column index {col_index} out of range [0, {len(self.column_name_list) - 1}]") + raise IndexError( + f"column index {col_index} out of range [0, {len(self.column_name_list) - 1}]" + ) if not (0 <= row_index < self.max_row_num): - raise IndexError(f"Row index {row_index} out of range [0, {self.max_row_num - 1}]") + raise IndexError( + f"Row index {row_index} out of range [0, {self.max_row_num - 1}]" + ) def set_table_name(self, table_name: str): self.target_name = table_name @@ -101,14 +111,21 @@ def add_timestamp(self, row_index: int, timestamp: int): def _check_numeric_range(self, value: Union[int, float], data_type: TSDataType): if math.isnan(value) or math.isinf(value): if data_type == TSDataType.INT32 or data_type == TSDataType.INT64: - raise ValueError(f"NaN/Inf is invalid for integer type {data_type.name}") + raise ValueError( + f"NaN/Inf is invalid for integer type {data_type.name}" + ) else: return min_val, max_val = self._type_ranges[data_type] if not (min_val <= value <= max_val): raise OverflowError(f"data:{value} out of range ({min_val}, {max_val})") - def add_value_by_name(self, column_name: str, row_index: int, value: Union[int, float, bool, str, bytes]): + def add_value_by_name( + self, + column_name: str, + row_index: int, + value: Union[int, float, bool, str, bytes], + ): try: col_index = self.column_name_list.index(column_name.lower()) except ValueError: @@ -124,18 +141,30 @@ def add_value_by_name(self, column_name: str, row_index: int, value: Union[int, if not isinstance(value, expected_type.to_py_type()): raise TypeError(f"Expected {expected_type.to_py_type()} got {type(value)}") - if expected_type in (TSDataType.INT32, TSDataType.INT64, TSDataType.FLOAT, TSDataType.DOUBLE): + if expected_type in ( + TSDataType.INT32, + TSDataType.INT64, + TSDataType.FLOAT, + TSDataType.DOUBLE, + ): self._check_numeric_range(value, expected_type) self.data_list[col_index][row_index] = value - def add_value_by_index(self, col_index: int, row_index: int, value: Union[int, float, bool, str, bytes]): + def add_value_by_index( + self, col_index: int, row_index: int, value: Union[int, float, bool, str, bytes] + ): self._check_index(col_index, row_index) expected_type = self.type_list[col_index] if not isinstance(value, expected_type.to_py_type()): raise TypeError(f"Expected {expected_type.to_py_type()} got {type(value)}") - if expected_type in (TSDataType.INT32, TSDataType.INT64, TSDataType.FLOAT, TSDataType.DOUBLE): + if expected_type in ( + TSDataType.INT32, + TSDataType.INT64, + TSDataType.FLOAT, + TSDataType.DOUBLE, + ): self._check_numeric_range(value, expected_type) self.data_list[col_index][row_index] = value diff --git a/python/tsfile/tag_filter.py b/python/tsfile/tag_filter.py index e5107263f..a40c0c47c 100644 --- a/python/tsfile/tag_filter.py +++ b/python/tsfile/tag_filter.py @@ -49,9 +49,19 @@ def __init__(self, column_name: str, value: str, op: int): self.op = op def __repr__(self): - op_names = {0: "==", 1: "!=", 2: "<", 3: "<=", 4: ">", 5: ">=", - 6: "=~", 7: "!~"} - return f"TagFilter({self.column_name} {op_names.get(self.op, '?')} {self.value!r})" + op_names = { + 0: "==", + 1: "!=", + 2: "<", + 3: "<=", + 4: ">", + 5: ">=", + 6: "=~", + 7: "!~", + } + return ( + f"TagFilter({self.column_name} {op_names.get(self.op, '?')} {self.value!r})" + ) class BetweenTagFilter(TagFilter): diff --git a/python/tsfile/tsfile_table_writer.py b/python/tsfile/tsfile_table_writer.py index cfd817fec..acc5fefb1 100644 --- a/python/tsfile/tsfile_table_writer.py +++ b/python/tsfile/tsfile_table_writer.py @@ -55,9 +55,7 @@ def validate_dataframe_for_tsfile(df: pd.DataFrame) -> None: if unsupported: msg_parts = [f" - {col}: dtype={dtype}" for col, dtype in unsupported] - raise ValueError( - "Data types not supported by tsfile:\n" + "\n".join(msg_parts) - ) + raise ValueError("Data types not supported by tsfile:\n" + "\n".join(msg_parts)) def infer_object_column_type(column_series: pd.Series) -> TSDataType: @@ -89,7 +87,9 @@ class TsFileTableWriter: according to that schema, and serialize this data into a TsFile. """ - def __init__(self, path: str, table_schema: TableSchema, memory_threshold = 128 * 1024 * 1024): + def __init__( + self, path: str, table_schema: TableSchema, memory_threshold=128 * 1024 * 1024 + ): """ :param path: The path of tsfile, will create if it doesn't exist. :param table_schema: describes the schema of the tables they want to write. @@ -108,8 +108,10 @@ def write_table(self, tablet: Tablet): """ if tablet.get_target_name() is None: tablet.set_table_name(self.tableSchema.get_table_name()) - elif (self.tableSchema.get_table_name() is not None - and tablet.get_target_name() != self.tableSchema.get_table_name()): + elif ( + self.tableSchema.get_table_name() is not None + and tablet.get_target_name() != self.tableSchema.get_table_name() + ): raise TableNotExistError self.writer.write_table(tablet) @@ -131,17 +133,16 @@ def write_dataframe(self, dataframe: pd.DataFrame): # tag columns used for sorting tag_columns = self.tableSchema.get_tag_columns() if time_column is None: - if 'time' in dataframe.columns: - dtype = TSDataType.from_pandas_datatype(dataframe['time'].dtype) + if "time" in dataframe.columns: + dtype = TSDataType.from_pandas_datatype(dataframe["time"].dtype) if not TSDataType.TIMESTAMP.is_compatible_with(dtype): raise TypeMismatchError( - code=27, - context=f"time column require INT/Timestamp" + code=27, context=f"time column require INT/Timestamp" ) - self.tableSchema.add_column(ColumnSchema("time", - TSDataType.TIMESTAMP, - ColumnCategory.TIME)) + self.tableSchema.add_column( + ColumnSchema("time", TSDataType.TIMESTAMP, ColumnCategory.TIME) + ) time_column = self.tableSchema.get_time_column() type_mismatches = [] @@ -150,10 +151,18 @@ def write_dataframe(self, dataframe: pd.DataFrame): continue schema_col = self.tableSchema.get_column(col_name) if schema_col is None: - raise ColumnNotExistError(context=f"{col_name} is not define in table schema") + raise ColumnNotExistError( + context=f"{col_name} is not define in table schema" + ) # Object dtype can represent STRING, DATE, TEXT, BLOB; validation will be performed during insert, skip here - if schema_col.get_data_type() in [TSDataType.INT64, TSDataType.INT32, TSDataType.DOUBLE, TSDataType.FLOAT, - TSDataType.BOOLEAN, TSDataType.TIMESTAMP]: + if schema_col.get_data_type() in [ + TSDataType.INT64, + TSDataType.INT32, + TSDataType.DOUBLE, + TSDataType.FLOAT, + TSDataType.BOOLEAN, + TSDataType.TIMESTAMP, + ]: df_dtype = dataframe[col_name].dtype df_ts_type = TSDataType.from_pandas_datatype(df_dtype) expected_ts_type = schema_col.get_data_type() @@ -165,8 +174,7 @@ def write_dataframe(self, dataframe: pd.DataFrame): if type_mismatches: raise TypeMismatchError( - code=27, - context=f"Type mismatches: {'; '.join(type_mismatches)}" + code=27, context=f"Type mismatches: {'; '.join(type_mismatches)}" ) if time_column: @@ -180,7 +188,9 @@ def write_dataframe(self, dataframe: pd.DataFrame): sort_by.append(time_column_name) dataframe = dataframe.sort_values(by=sort_by) - self.writer.write_dataframe(self.tableSchema.get_table_name(), dataframe, self.tableSchema) + self.writer.write_dataframe( + self.tableSchema.get_table_name(), dataframe, self.tableSchema + ) def write_arrow_batch(self, data): """ @@ -198,8 +208,12 @@ def write_arrow_batch(self, data): time_col_index = data.schema.get_field_index(time_col_name) if time_col_index < 0: - raise ValueError(f"Time column '{time_col_name}' not found in Arrow schema.") - self.writer.write_arrow_batch(self.tableSchema.get_table_name(), data, time_col_index) + raise ValueError( + f"Time column '{time_col_name}' not found in Arrow schema." + ) + self.writer.write_arrow_batch( + self.tableSchema.get_table_name(), data, time_col_index + ) def close(self): """ diff --git a/python/tsfile/utils.py b/python/tsfile/utils.py index 1d9a89975..ea9263149 100644 --- a/python/tsfile/utils.py +++ b/python/tsfile/utils.py @@ -26,73 +26,79 @@ from tsfile import ColumnSchema, TableSchema, ColumnCategory, TSDataType, TIME_COLUMN from tsfile.exceptions import TableNotExistError, ColumnNotExistError from tsfile.tsfile_reader import TsFileReaderPy -from tsfile.tsfile_table_writer import TsFileTableWriter, infer_object_column_type, validate_dataframe_for_tsfile +from tsfile.tsfile_table_writer import ( + TsFileTableWriter, + infer_object_column_type, + validate_dataframe_for_tsfile, +) + + +def to_dataframe( + file_path: str, + table_name: Optional[str] = None, + column_names: Optional[list[str]] = None, + start_time: Optional[int] = None, + end_time: Optional[int] = None, + max_row_num: Optional[int] = None, + as_iterator: bool = False, +) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """ + Read data from a TsFile and convert it into a Pandas DataFrame or + an iterator of DataFrames. + + This function supports both table-model and tree-model TsFiles. + Users can filter data by table name, column names, time range, + and maximum number of rows. + + Parameters + ---------- + file_path : str + Path to the TsFile to be read. + table_name : Optional[str], default None + Name of the table to query in table-model TsFiles. + If None and the file is in table model, the first table + found in the schema will be used. + + column_names : Optional[list[str]], default None + List of column names to query. + - If None, all columns will be returned. + - Column existence will be validated in table-model TsFiles. + + start_time : Optional[int], default None + Start timestamp for the query. + If None, the minimum int64 value is used. + + end_time : Optional[int], default None + End timestamp for the query. + If None, the maximum int64 value is used. + + max_row_num : Optional[int], default None + Maximum number of rows to read. + - If None, all available rows will be returned. + - When `as_iterator` is False, the final DataFrame will be + truncated to this size if necessary. + + as_iterator : bool, default False + Whether to return an iterator of DataFrames instead of + a single concatenated DataFrame. + - True: returns an iterator yielding DataFrames in batches + - False: returns a single Pandas DataFrame -def to_dataframe(file_path: str, - table_name: Optional[str] = None, - column_names: Optional[list[str]] = None, - start_time: Optional[int] = None, - end_time: Optional[int] = None, - max_row_num: Optional[int] = None, - as_iterator: bool = False) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + Returns + ------- + Union[pandas.DataFrame, Iterator[pandas.DataFrame]] + - A Pandas DataFrame if `as_iterator` is False + - An iterator of Pandas DataFrames if `as_iterator` is True + + Raises + ------ + TableNotExistError + If the specified table name does not exist in a table-model TsFile. + + ColumnNotExistError + If any specified column does not exist in the table schema. """ - Read data from a TsFile and convert it into a Pandas DataFrame or - an iterator of DataFrames. - - This function supports both table-model and tree-model TsFiles. - Users can filter data by table name, column names, time range, - and maximum number of rows. - - Parameters - ---------- - file_path : str - Path to the TsFile to be read. - - table_name : Optional[str], default None - Name of the table to query in table-model TsFiles. - If None and the file is in table model, the first table - found in the schema will be used. - - column_names : Optional[list[str]], default None - List of column names to query. - - If None, all columns will be returned. - - Column existence will be validated in table-model TsFiles. - - start_time : Optional[int], default None - Start timestamp for the query. - If None, the minimum int64 value is used. - - end_time : Optional[int], default None - End timestamp for the query. - If None, the maximum int64 value is used. - - max_row_num : Optional[int], default None - Maximum number of rows to read. - - If None, all available rows will be returned. - - When `as_iterator` is False, the final DataFrame will be - truncated to this size if necessary. - - as_iterator : bool, default False - Whether to return an iterator of DataFrames instead of - a single concatenated DataFrame. - - True: returns an iterator yielding DataFrames in batches - - False: returns a single Pandas DataFrame - - Returns - ------- - Union[pandas.DataFrame, Iterator[pandas.DataFrame]] - - A Pandas DataFrame if `as_iterator` is False - - An iterator of Pandas DataFrames if `as_iterator` is True - - Raises - ------ - TableNotExistError - If the specified table name does not exist in a table-model TsFile. - - ColumnNotExistError - If any specified column does not exist in the table schema. - """ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: _table_name = table_name @@ -126,7 +132,11 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: no_field_query = False else: _table_name = _table_name.lower() if _table_name else None - _column_names = [column.lower() for column in _column_names] if _column_names else None + _column_names = ( + [column.lower() for column in _column_names] + if _column_names + else None + ) if _table_name is None: _table_name, table_schema = next(iter(table_schema.items())) else: @@ -146,7 +156,10 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: for column in _column_names: if column not in column_names_in_file and column != time_column: raise ColumnNotExistError(column) - if table_schema.get_column(column).get_category() == ColumnCategory.FIELD: + if ( + table_schema.get_column(column).get_category() + == ColumnCategory.FIELD + ): no_field_query = False if no_field_query: if time_column is not None: @@ -161,9 +174,13 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: if is_tree_model: if _column_names is not None: column_name_to_query = _column_names - query_result = reader.query_table_on_tree(column_name_to_query, _start_time, _end_time) + query_result = reader.query_table_on_tree( + column_name_to_query, _start_time, _end_time + ) else: - query_result = reader.query_table(_table_name, column_name_to_query, _start_time, _end_time) + query_result = reader.query_table( + _table_name, column_name_to_query, _start_time, _end_time + ) with query_result as result: while result.next(): @@ -181,12 +198,18 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: total_rows += len(dataframe) if time_column is not None: if _column_names is None or time_column not in _column_names: - dataframe = dataframe.rename(columns={dataframe.columns[0]: time_column}) + dataframe = dataframe.rename( + columns={dataframe.columns[0]: time_column} + ) if no_field_query and _column_names is not None: _column_names.insert(0, TIME_COLUMN) dataframe = dataframe[_column_names] yield dataframe - if (not is_iterator) and max_row_num is not None and total_rows >= max_row_num: + if ( + (not is_iterator) + and max_row_num is not None + and total_rows >= max_row_num + ): break if as_iterator: @@ -202,12 +225,13 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: return pd.DataFrame() -def dataframe_to_tsfile(dataframe: pd.DataFrame, - file_path: str, - table_name: Optional[str] = None, - time_column: Optional[str] = None, - tag_column: Optional[list[str]] = None, - ): +def dataframe_to_tsfile( + dataframe: pd.DataFrame, + file_path: str, + table_name: Optional[str] = None, + time_column: Optional[str] = None, + tag_column: Optional[list[str]] = None, +): """ Write a pandas DataFrame to a TsFile by inferring the table schema from the DataFrame. @@ -262,19 +286,22 @@ def dataframe_to_tsfile(dataframe: pd.DataFrame, if time_column is not None: time_col_name = time_column.lower() - elif 'time' in df.columns: - time_col_name = 'time' + elif "time" in df.columns: + time_col_name = "time" else: time_col_name = None if time_col_name is not None: if not is_integer_dtype(df[time_col_name].dtype): raise TypeError( - f"Time column '{time_col_name}' must be integer type (int64 or int), got {df[time_col_name].dtype}") + f"Time column '{time_col_name}' must be integer type (int64 or int), got {df[time_col_name].dtype}" + ) column_schemas = [] if time_col_name is not None: - column_schemas.append(ColumnSchema(time_col_name, TSDataType.TIMESTAMP, ColumnCategory.TIME)) + column_schemas.append( + ColumnSchema(time_col_name, TSDataType.TIMESTAMP, ColumnCategory.TIME) + ) for col in df.columns: if col == time_col_name: @@ -285,12 +312,18 @@ def dataframe_to_tsfile(dataframe: pd.DataFrame, else: ts_data_type = TSDataType.from_pandas_datatype(col_dtype) - category = ColumnCategory.TAG if col in tag_columns_lower else ColumnCategory.FIELD + category = ( + ColumnCategory.TAG if col in tag_columns_lower else ColumnCategory.FIELD + ) column_schemas.append(ColumnSchema(col, ts_data_type, category)) - data_columns = [s for s in column_schemas if s.get_category() != ColumnCategory.TIME] + data_columns = [ + s for s in column_schemas if s.get_category() != ColumnCategory.TIME + ] if len(data_columns) == 0: - raise ValueError("DataFrame must have at least one data column besides the time column") + raise ValueError( + "DataFrame must have at least one data column besides the time column" + ) table_schema = TableSchema(table_name, column_schemas)