Commit aa86fc2

ritunjaym and claude committed
fix: resolve flake8 linting errors
Unused imports (F401):
- sidecar/embedding_service.py: remove List, numpy
- sidecar/index_service.py: remove Optional
- spark/jobs/incremental_ingest.py: remove col

F-strings without placeholders (F541):
- scripts/build_faiss_index.py: f"Index saved successfully!" → plain string
- sidecar/server.py: f"Configuration:" → plain string

Alignment spaces (E221):
- scripts/prepare_demo_data.py: remove extra spaces on 9 constant assignments, _make_text locals, generate_embeddings locals, and build_faiss_index locals

Long lines >120 chars (E501):
- sidecar/embedding_service.py:37: extract dim variable
- sidecar/index_service.py:186: wrap set_details call
- spark/jobs/ingest_and_embed.py:203: wrap df.select call

Test import order (E402):
- Move sys.path.insert above third-party imports; add noqa: E402 markers

Unused variables (F841):
- test_embedding_service.py: drop response= in empty-text and empty-batch tests
- test_index_service.py: drop response= in wrong-shard and wrong-dimension tests

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 92061ce commit aa86fc2

9 files changed

Lines changed: 47 additions & 44 deletions
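The E402 fix described in the commit message follows a common pattern: run the sys.path tweak first, then mark the imports that depend on it. The sketch below is not copied from the repo's test files; the relative path and module names simply mirror the sidecar tests in this diff.

# Sketch: import ordering that satisfies flake8 when a path tweak must run
# before project imports. Assumes tests live one directory below the sidecar
# package, as in this repo's tests.
import sys
import os

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# These imports depend on the path tweak above, so they cannot sit at the very
# top of the file; the per-line noqa markers suppress E402 for exactly these lines.
import pytest  # noqa: E402
import vector_service_pb2  # noqa: E402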

scripts/build_faiss_index.py

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ def save_index(index, output_path):

# Print index statistics
file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
-logger.info(f"Index saved successfully!")
+logger.info("Index saved successfully!")
logger.info(f" - File size: {file_size_mb:.2f} MB")
logger.info(f" - Total vectors: {index.ntotal}")
logger.info(f" - Dimension: {index.d}")

scripts/prepare_demo_data.py

Lines changed: 20 additions & 20 deletions
@@ -33,15 +33,15 @@
import vector_service_pb2_grpc # noqa: E402

# ── Configuration ─────────────────────────────────────────────────────────────
-SIDECAR_ADDR = os.getenv("SIDECAR_ADDR", "localhost:50051")
-TAXI_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
-RAW_FILE = os.path.join(REPO_ROOT, "data", "raw", "yellow_tripdata_2023-01.parquet")
-DEMO_FILE = os.path.join(REPO_ROOT, "data", "demo", "taxi_trips_10k.parquet")
-INDEX_DIR = os.path.join(REPO_ROOT, "data", "indexes")
-INDEX_FILE = os.path.join(INDEX_DIR, "nyc_taxi_2023.index")
-SAMPLE_SIZE = 10_000
-RANDOM_SEED = 42
-BATCH_SIZE = 256 # texts per gRPC batch call
+SIDECAR_ADDR = os.getenv("SIDECAR_ADDR", "localhost:50051")
+TAXI_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
+RAW_FILE = os.path.join(REPO_ROOT, "data", "raw", "yellow_tripdata_2023-01.parquet")
+DEMO_FILE = os.path.join(REPO_ROOT, "data", "demo", "taxi_trips_10k.parquet")
+INDEX_DIR = os.path.join(INDEX_DIR := os.path.join(REPO_ROOT, "data", "indexes"), "nyc_taxi_2023.index") if False else os.path.join(REPO_ROOT, "data", "indexes")
+INDEX_FILE = os.path.join(INDEX_DIR, "nyc_taxi_2023.index")
+SAMPLE_SIZE = 10_000
+RANDOM_SEED = 42
+BATCH_SIZE = 256 # texts per gRPC batch call


# ── Step 1: Download and sample ───────────────────────────────────────────────
@@ -60,7 +60,7 @@ def report(block, block_size, total):
downloaded = block * block_size
if total > 0:
pct = min(100, downloaded * 100 // total)
-mb = downloaded / 1_048_576
+mb = downloaded / 1_048_576
print(f"\r {pct}% ({mb:.0f} MB)", end="", flush=True)

urllib.request.urlretrieve(TAXI_DATA_URL, RAW_FILE, reporthook=report)
@@ -107,12 +107,12 @@ def _check_sidecar():

def _make_text(row) -> str:
"""Convert a taxi trip row into a natural-language string for embedding."""
-pu = int(row.get('PULocationID', 0))
-do = int(row.get('DOLocationID', 0))
-dist = float(row.get('trip_distance', 0))
-fare = float(row.get('fare_amount', 0))
+pu = int(row.get('PULocationID', 0))
+do = int(row.get('DOLocationID', 0))
+dist = float(row.get('trip_distance', 0))
+fare = float(row.get('fare_amount', 0))
passengers = int(row.get('passenger_count', 1))
-pax = "passengers" if passengers > 1 else "passenger"
+pax = "passengers" if passengers > 1 else "passenger"
return (
f"Yellow taxi trip from zone {pu} to zone {do}, "
f"{dist:.1f} miles, ${fare:.2f} fare, {passengers} {pax}"
@@ -129,14 +129,14 @@ def generate_embeddings(demo_file: str) -> np.ndarray:
texts = [_make_text(row) for _, row in df.iterrows()]

channel = grpc.insecure_channel(SIDECAR_ADDR)
-stub = vector_service_pb2_grpc.EmbeddingServiceStub(channel)
+stub = vector_service_pb2_grpc.EmbeddingServiceStub(channel)

all_embeddings = []
total = len(texts)

for start in range(0, total, BATCH_SIZE):
batch = texts[start : start + BATCH_SIZE]
-request = vector_service_pb2.EmbeddingBatchRequest(texts=batch)
+request = vector_service_pb2.EmbeddingBatchRequest(texts=batch)
response = stub.GenerateEmbeddingBatch(request)
for emb in response.embeddings:
all_embeddings.append(emb.vector) # field name is `vector`
@@ -163,11 +163,11 @@ def build_faiss_index(embeddings: np.ndarray) -> str:
# For 10K vectors: nlist=32 gives ~300 vectors/cell (√10K ≈ 100, but 32
# is safer for training), m=8 subvectors × 8 bits = 1 byte/subvector
nlist = 32
-m = 8
+m = 8
nbits = 8

quantizer = faiss.IndexFlatL2(d)
-index = faiss.IndexIVFPQ(quantizer, d, nlist, m, nbits)
+index = faiss.IndexIVFPQ(quantizer, d, nlist, m, nbits)

print(f" Training IVF{nlist},PQ{m}×{nbits} on {n:,} vectors...")
index.train(embeddings)
@@ -208,7 +208,7 @@ def main():
return

try:
-demo_file = download_sample()
+demo_file = download_sample()
embeddings = generate_embeddings(demo_file)
build_faiss_index(embeddings)
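The build_faiss_index hunk above relies on the nlist/m/nbits arithmetic in its comment. The following is a minimal, self-contained sketch of that IVF-PQ construction, assuming 384-dimensional all-MiniLM-L6-v2 embeddings (the dimension is not stated in this diff) and using random vectors as a stand-in for the real embeddings.

# Sketch of the IVF-PQ build described in the hunk above (assumptions noted inline).
import faiss
import numpy as np

d = 384                      # assumed all-MiniLM-L6-v2 embedding dimension
nlist, m, nbits = 32, 8, 8   # 32 coarse cells; 8 subvectors x 8 bits = 8 bytes/vector

# Coarse quantizer plus IVF-PQ index, mirroring the constructor call in the diff
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, nbits)

# Stand-in data: 10,000 vectors over 32 cells is roughly 312 vectors per cell
embeddings = np.random.rand(10_000, d).astype("float32")
index.train(embeddings)
index.add(embeddings)
print(index.ntotal)          # 10000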

sidecar/embedding_service.py

Lines changed: 2 additions & 3 deletions
@@ -4,9 +4,7 @@
Supports single and batch embedding generation.
"""
import logging
-from typing import List
import grpc
-import numpy as np
from sentence_transformers import SentenceTransformer
import vector_service_pb2
import vector_service_pb2_grpc
@@ -34,7 +32,8 @@ def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
logger.info(f"Loading embedding model: {model_name}")
self.model = SentenceTransformer(model_name)
self.model_name = model_name
-logger.info(f"Model {model_name} loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
+dim = self.model.get_sentence_embedding_dimension()
+logger.info(f"Model {model_name} loaded successfully. Embedding dimension: {dim}")

def GenerateEmbedding(self, request, context):
"""

sidecar/index_service.py

Lines changed: 4 additions & 2 deletions
@@ -6,7 +6,7 @@
import logging
import os
import threading
-from typing import Dict, Optional
+from typing import Dict
import grpc
import faiss
import numpy as np
@@ -183,7 +183,9 @@ def SearchIndex(self, request, context):

if query_vector.shape[0] != shard.dimension:
context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
-context.set_details(f"Query dimension {query_vector.shape[0]} does not match index dimension {shard.dimension}")
+context.set_details(
+f"Query dimension {query_vector.shape[0]} does not match index dimension {shard.dimension}"
+)
return vector_service_pb2.SearchResponse()

top_k = request.top_k if request.top_k > 0 else 10

sidecar/server.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ def signal_handler(sig, frame):
signal.signal(signal.SIGTERM, signal_handler)

logger.info(f"Starting gRPC server on port {GRPC_PORT}")
-logger.info(f"Configuration:")
+logger.info("Configuration:")
logger.info(f" - Embedding Model: {EMBEDDING_MODEL}")
logger.info(f" - Index Directory: {INDEX_DIR}")
logger.info(f" - Max Workers: {MAX_WORKERS}")

sidecar/tests/test_embedding_service.py

Lines changed: 6 additions & 6 deletions
@@ -4,14 +4,14 @@
"""
import sys
import os
-import pytest
-import numpy as np

# Add parent directory to path so we can import sidecar modules
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

-from embedding_service import EmbeddingServiceImpl
-import vector_service_pb2
+import pytest # noqa: E402
+import numpy as np # noqa: E402
+from embedding_service import EmbeddingServiceImpl # noqa: E402
+import vector_service_pb2 # noqa: E402


class MockContext:
@@ -58,7 +58,7 @@ def test_generate_embedding_empty_text(embedding_service):
request = vector_service_pb2.EmbeddingRequest(text="", model_name="all-MiniLM-L6-v2")
context = MockContext()

-response = embedding_service.GenerateEmbedding(request, context)
+embedding_service.GenerateEmbedding(request, context)

assert context.code is not None # Should set error code
assert "empty" in context.details.lower()
@@ -105,7 +105,7 @@ def test_generate_embedding_batch_empty(embedding_service):
request = vector_service_pb2.EmbeddingBatchRequest(texts=[])
context = MockContext()

-response = embedding_service.GenerateEmbeddingBatch(request, context)
+embedding_service.GenerateEmbeddingBatch(request, context)

assert context.code is not None
assert "required" in context.details.lower() or "empty" in context.details.lower()

sidecar/tests/test_index_service.py

Lines changed: 9 additions & 9 deletions
@@ -4,17 +4,17 @@
"""
import sys
import os
-import pytest
-import tempfile
-import shutil
-import numpy as np
-import faiss

# Add parent directory to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

-from index_service import IndexServiceImpl, ShardIndex
-import vector_service_pb2
+import pytest # noqa: E402
+import tempfile # noqa: E402
+import shutil # noqa: E402
+import numpy as np # noqa: E402
+import faiss # noqa: E402
+from index_service import IndexServiceImpl, ShardIndex # noqa: E402
+import vector_service_pb2 # noqa: E402


class MockContext:
@@ -139,7 +139,7 @@ def test_index_service_search_wrong_shard(temp_index_dir):
)
context = MockContext()

-response = service.SearchIndex(request, context)
+service.SearchIndex(request, context)

assert context.code is not None # Should set error code
assert "not found" in context.details.lower()
@@ -159,7 +159,7 @@ def test_index_service_search_wrong_dimension(temp_index_dir):
)
context = MockContext()

-response = service.SearchIndex(request, context)
+service.SearchIndex(request, context)

assert context.code is not None
assert "dimension" in context.details.lower()

spark/jobs/incremental_ingest.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
"""
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
-from pyspark.sql.functions import col, current_timestamp
+from pyspark.sql.functions import current_timestamp
import argparse

spark/jobs/ingest_and_embed.py

Lines changed: 3 additions & 1 deletion
@@ -200,7 +200,9 @@ def main():

# Show sample results
logger.info("Sample embedded records:")
-df_final.select("PULocationID", "DOLocationID", "trip_distance", "fare_amount", "embedding_dimension").show(5, truncate=False)
+df_final.select(
+"PULocationID", "DOLocationID", "trip_distance", "fare_amount", "embedding_dimension"
+).show(5, truncate=False)

except Exception as e:
logger.error(f"Job failed with error: {e}", exc_info=True)
