-
Notifications
You must be signed in to change notification settings - Fork 0
trying to run celery queue with airflow #29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,20 +7,9 @@ | |
| import polars as pl | ||
|
|
||
|
|
||
| today = datetime.now().strftime("%Y-%m-%d") | ||
| def save_to_parquet(the_data): | ||
| today = datetime.now().strftime("%Y-%m-%d") | ||
|
|
||
| print("Waiting for Celery task to complete") | ||
|
|
||
| try: | ||
| print("Getting the result") | ||
| response = build_repo_chord(total=5000, batch_size=500) | ||
| the_data = response.get(timeout=3600) # 1 hour timeout | ||
| print(f"Result: {the_data}") | ||
|
|
||
| except Exception as e: | ||
| print(f"Error: {e}") | ||
|
|
||
| else: | ||
| if not Path(f"data/{today}/").exists(): | ||
| Path(f"data/{today}").mkdir(parents=True, exist_ok=True) | ||
|
|
||
|
|
@@ -30,3 +19,20 @@ | |
| df = pl.DataFrame(the_data) | ||
| df.write_parquet(f"data/{today}/github_data.parquet", compression="zstd") | ||
| print("Valid Parquet data") | ||
|
|
||
|
|
||
| def get_data_from_queue(): | ||
| try: | ||
| print("Getting the result") | ||
| response = build_repo_chord(total=5000, batch_size=500) | ||
| the_data = response.get(timeout=3600) # 1 hour timeout | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The call to |
||
| print(f"Result: {the_data}") | ||
|
|
||
| except Exception as e: | ||
| print(f"Error: {e}") | ||
|
|
||
| return save_to_parquet(the_data) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| get_data_from_queue() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,12 +3,12 @@ | |
| from pathlib import Path | ||
| import json | ||
| import boto3 | ||
| import time | ||
| from celery import Celery, group, chord | ||
| from celery.utils.log import get_task_logger | ||
| from datetime import datetime | ||
| from github import Auth, Github, GithubException | ||
| from dotenv import load_dotenv | ||
| from client import get_data_from_queue | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This import creates a circular dependency. |
||
| from pydantic_models.github import RabbitMQ_Data_Validation | ||
| from rb_queue.rabbitmq import get_connection, QUEUE_NAME | ||
| load_dotenv() | ||
|
|
@@ -183,6 +183,13 @@ def build_repo_chord(total: int = 5000, batch_size: int = 500): | |
| return chord(header)(aggregate_results.s()) | ||
|
|
||
|
|
||
| @app.task | ||
| def run_queue_and_save(total: int = 5000, batch_size: int = 500): | ||
| return get_data_from_queue(total=total, batch_size=batch_size) | ||
|
Comment on lines
+187
to
+188
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This task calls |
||
|
|
||
|
|
||
|
|
||
|
|
||
| # old code that did not work | ||
| # @app.task | ||
| # def distribute_tasks(): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are trying to send a task named
`worker.get_data_from_queue`, but there is no Celery task with this name. The function `get_data_from_queue` in `client.py` is not decorated as a task. I believe you intended to call the `run_queue_and_save` task defined in `worker.py`.