Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions python/examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,14 @@

from tsfile import ColumnSchema, TableSchema
from tsfile import Tablet
from tsfile import TsFileTableWriter, TsFileReader, TSDataType, TSEncoding, Compressor, ColumnCategory
from tsfile import (
TsFileTableWriter,
TsFileReader,
TSDataType,
TSEncoding,
Compressor,
ColumnCategory,
)

## Write
table_data_dir = os.path.join(os.path.dirname(__file__), "table_data.tsfile")
Expand All @@ -36,9 +43,10 @@
with TsFileTableWriter(table_data_dir, table_schema) as writer:
tablet_row_num = 100
tablet = Tablet(
["id", "id2", "value"],
[TSDataType.STRING, TSDataType.STRING, TSDataType.FLOAT],
tablet_row_num)
["id", "id2", "value"],
[TSDataType.STRING, TSDataType.STRING, TSDataType.FLOAT],
tablet_row_num,
)

for i in range(tablet_row_num):
tablet.add_timestamp(i, i * 10)
Expand All @@ -57,4 +65,3 @@
print(result.get_value_by_name("id2"))
print(result.get_value_by_name("value"))
print(result.read_data_frame())

30 changes: 30 additions & 0 deletions python/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,36 @@
<build>
<sourceDirectory>${project.basedir}</sourceDirectory>
<plugins>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>${spotless.version}</version>
<configuration>
<python>
<includes>
<include>examples/**/*.py</include>
<include>tests/**/*.py</include>
<include>tsfile/**/*.py</include>
<include>setup.py</include>
</includes>
<black>
<version>24.10.0</version>
<pathToExe>${project.basedir}/${python.venv.bin}black</pathToExe>
</black>
</python>
<lineEndings>UNIX</lineEndings>
<skip>${spotless.skip}</skip>
</configuration>
<executions>
<execution>
<id>spotless-check</id>
<goals>
<goal>check</goal>
</goals>
<phase>validate</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
Expand Down
2 changes: 1 addition & 1 deletion python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
#

cython==3.0.10
black==24.10.0
numpy>=2.0.0,<3
pandas==2.2.2
setuptools==78.1.1
wheel==0.46.2
pyarrow>=8.0.0

26 changes: 20 additions & 6 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@
shutil.rmtree(PKG / "include")
shutil.copytree(CPP_INC, PKG / "include")
if sys.platform.startswith("linux"):
candidates = sorted(CPP_LIB.glob("libtsfile.so*"), key=lambda p: len(p.name), reverse=True)
candidates = sorted(
CPP_LIB.glob("libtsfile.so*"), key=lambda p: len(p.name), reverse=True
)
if not candidates:
raise FileNotFoundError("missing libtsfile.so* in build output")
src = candidates[0]
Expand All @@ -51,7 +53,9 @@
shutil.copy2(src, link_name)

elif sys.platform == "darwin":
candidates = sorted(CPP_LIB.glob("libtsfile.*.dylib")) or list(CPP_LIB.glob("libtsfile.dylib"))
candidates = sorted(CPP_LIB.glob("libtsfile.*.dylib")) or list(
CPP_LIB.glob("libtsfile.dylib")
)
if not candidates:
raise FileNotFoundError("missing libtsfile*.dylib in build output")
src = candidates[0]
Expand All @@ -61,8 +65,12 @@
shutil.copy2(src, link_name)
elif sys.platform == "win32":
for base_name in ("libtsfile",):
dll_candidates = sorted(CPP_LIB.glob(f"{base_name}*.dll"), key=lambda p: len(p.name), reverse=True)
dll_a_candidates = sorted(CPP_LIB.glob(f"{base_name}*.dll.a"), key=lambda p: len(p.name), reverse=True)
dll_candidates = sorted(
CPP_LIB.glob(f"{base_name}*.dll"), key=lambda p: len(p.name), reverse=True
)
dll_a_candidates = sorted(
CPP_LIB.glob(f"{base_name}*.dll.a"), key=lambda p: len(p.name), reverse=True
)

if not dll_candidates:
raise FileNotFoundError(f"missing {base_name}*.dll in build output")
Expand Down Expand Up @@ -119,8 +127,14 @@ def finalize_options(self):
extra_link_args += ["-Wl,-rpath,@loader_path", "-stdlib=libc++"]
elif sys.platform == "win32":
libraries = ["tsfile"]
extra_compile_args += ["-O2", "-std=c++11", "-DSIZEOF_VOID_P=8", "-D__USE_MINGW_ANSI_STDIO=1", "-DMS_WIN64",
"-D_WIN64"]
extra_compile_args += [
"-O2",
"-std=c++11",
"-DSIZEOF_VOID_P=8",
"-D__USE_MINGW_ANSI_STDIO=1",
"-DMS_WIN64",
"-D_WIN64",
]
extra_link_args += []
else:
raise RuntimeError(f"Unsupported platform: {sys.platform}")
Expand Down
27 changes: 18 additions & 9 deletions python/tests/bench_batch_arrow_vs_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,15 @@ def _ensure_bench_tsfile(file_path: str, row_count: int) -> None:
remove(file_path)
# Build data with pandas/numpy (vectorized, much faster than row-by-row Tablet)
import numpy as np
df = pd.DataFrame({
"time": np.arange(row_count, dtype=np.int64),
"device": pd.Series([f"device" for i in range(row_count)]),
"value1": np.arange(0, row_count * 10, 10, dtype=np.int64),
"value2": np.arange(row_count, dtype=np.float64) * 1.5,
})

df = pd.DataFrame(
{
"time": np.arange(row_count, dtype=np.int64),
"device": pd.Series([f"device" for i in range(row_count)]),
"value1": np.arange(0, row_count * 10, 10, dtype=np.int64),
"value2": np.arange(row_count, dtype=np.float64) * 1.5,
}
)

table = TableSchema(
TABLE_NAME,
Expand Down Expand Up @@ -135,7 +138,9 @@ def _run_timed(name: str, func, *args, rounds: int = DEFAULT_TIMED_ROUNDS):
avg = sum(times) / len(times)
total_rows = n
rows_per_sec = total_rows / avg if avg > 0 else 0
print(f" {name}: {avg:.3f}s avg ({min(times):.3f}s min) rows={total_rows} {rows_per_sec:.0f} rows/s")
print(
f" {name}: {avg:.3f}s avg ({min(times):.3f}s min) rows={total_rows} {rows_per_sec:.0f} rows/s"
)
return avg, total_rows


Expand All @@ -148,7 +153,9 @@ def run_benchmark(
_ensure_bench_tsfile(file_path, row_count)
end_time = row_count + 1

print(f"Benchmark: {row_count} rows, batch_size={batch_size}, timed_rounds={timed_rounds}")
print(
f"Benchmark: {row_count} rows, batch_size={batch_size}, timed_rounds={timed_rounds}"
)

df_avg, df_rows = _run_timed(
"query_table + read_data_frame",
Expand All @@ -170,7 +177,9 @@ def run_benchmark(
print()
if df_avg > 0:
speedup = arrow_avg / df_avg
print(f" Arrow vs DataFrame time ratio: {speedup:.2f}x ({'Arrow faster' if speedup < 1 else 'DataFrame faster'})")
print(
f" Arrow vs DataFrame time ratio: {speedup:.2f}x ({'Arrow faster' if speedup < 1 else 'DataFrame faster'})"
)
assert df_rows == row_count, f"DataFrame path row count {df_rows} != {row_count}"
assert arrow_rows == row_count, f"Arrow path row count {arrow_rows} != {row_count}"

Expand Down
Loading
Loading