diff --git a/.DS_Store b/.DS_Store index a0e9b59..c08462d 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/AGENTS.md b/AGENTS.md index e1f04e5..a4f38e0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,7 +33,7 @@ Podsumer is a self-hosted podcast aggregator (podcatcher) written in PHP with ze 4. **Constants**: UPPERCASE with underscores (e.g., `VERSION`, `PODSUMER_PATH`) 5. **Database Tables**: Plural snake_case (e.g., `feeds`, `items`, `file_contents`) 6. **Database Columns**: snake_case (e.g., `url_hash`, `last_update`) -7. **Configuration Keys**: snake_case (e.g., `store_media_on_disk`, `state_file`) +7. **Configuration Keys**: snake_case (e.g., `media_dir`, `state_file`) ### File Organization @@ -155,7 +155,7 @@ function feed(array $args): void ## Special Considerations -1. **Media Storage**: Can be configured for database or disk storage, but cannot be changed after library is established +1. **Media Storage**: All media files are stored on disk in the configured `media_dir` 2. **Performance**: Database operations use transactions and optimized PRAGMA settings 3. **Compatibility**: Requires SQLite 3.6.19+ with foreign key support 4. **Memory**: No memory limit set (`memory_limit = -1`) @@ -173,7 +173,6 @@ function feed(array $args): void 2. Generate coverage: `composer coverage` 3. View coverage locally: `composer coverage-dev` 4. Ensure no regression in test coverage -5. 
Test with both database and disk storage modes ## Contribution Behavior diff --git a/Dockerfile b/Dockerfile index 5c0120c..50b4042 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,32 +1,52 @@ -FROM php:8.2.12-apache-bookworm -RUN apt update && apt install -y git -RUN php -r "copy('https://getcomposer.org/installer', 'composer-setup.php');" && \ - php -r "if (hash_file('sha384', 'composer-setup.php') === 'dac665fdc30fdd8ec78b38b9800061b4150413ff2e3b6f88543c636f7cd84f6db9189d43a81e5503cda447da73c7e5b6') { echo 'Installer verified'; } else { echo 'Installer corrupt'; unlink('composer-setup.php'); } echo PHP_EOL;" && \ - php composer-setup.php && \ - php -r "unlink('composer-setup.php');" && \ - mv composer.phar /usr/local/bin/composer +FROM php:8.2-apache + +# Install system dependencies RUN apt-get update && apt-get install -y \ - libzip-dev && \ - docker-php-ext-configure zip && \ - docker-php-ext-install -j$(nproc) zip + libsqlite3-dev \ + sqlite3 \ + ffmpeg \ + cron \ + && rm -rf /var/lib/apt/lists/* + +# Install PHP extensions +RUN docker-php-ext-install pdo pdo_sqlite + +# Enable Apache modules RUN a2enmod rewrite -RUN pecl install pcov && docker-php-ext-enable pcov -RUN mkdir -p /opt/podsumer/ -RUN mkdir -p /opt/podsumer/conf -COPY ./apache.conf /etc/apache2/sites-available/000-default.conf -COPY ./apache.conf /etc/apache2/sites-enabled/000-default.conf -COPY ./conf/podsumer.conf /opt/podsumer/conf/podsumer.conf -COPY ./sql /opt/podsumer/sql -COPY ./src /opt/podsumer/src -COPY ./templates /opt/podsumer/templates -COPY ./www /opt/podsumer/www -COPY ./composer.json /opt/podsumer/composer.json -COPY ./composer.lock /opt/podsumer/composer.lock -WORKDIR /opt/podsumer -RUN chown -R www-data:www-data /opt/podsumer -RUN chmod -R 755 /opt/podsumer -RUN chown -R www-data:www-data /etc/apache2/sites-available/000-default.conf -RUN chown -R www-data:www-data /etc/apache2/sites-enabled/000-default.conf -RUN composer dump-autoload -EXPOSE 3094 + +# Set working directory 
+WORKDIR /var/www/html + +# Copy application files +COPY . . + +# Install Composer +COPY --from=composer:latest /usr/bin/composer /usr/bin/composer + +# Install dependencies +RUN composer install --no-dev --optimize-autoloader + +# Set permissions +RUN chown -R www-data:www-data /var/www/html \ + && chmod -R 755 /var/www/html \ + && chmod +x /var/www/html/scripts/refresh_feeds.php + +# Create media directory +RUN mkdir -p /opt/media && chown -R www-data:www-data /opt/media + +# Configure Apache - fix the path to use the correct apache.conf file +COPY apache.conf /etc/apache2/sites-available/000-default.conf + +# Create a script to generate the crontab with the configured interval +RUN echo '#!/bin/bash\n\ +REFRESH_INTERVAL=$(php -r "include \"/var/www/html/conf/podsumer.conf\"; echo \$feed_refresh_interval ?? 6;")\n\ +echo "0 */${REFRESH_INTERVAL} * * * www-data /usr/local/bin/php /var/www/html/scripts/refresh_feeds.php >> /var/log/cron.log 2>&1" > /etc/cron.d/podsumer-cron\n\ +chmod 0644 /etc/cron.d/podsumer-cron\n\ +crontab /etc/cron.d/podsumer-cron\n\ +service cron start\n\ +apache2-foreground' > /usr/local/bin/start.sh \ + && chmod +x /usr/local/bin/start.sh + +# Start the container with our custom script +CMD ["/usr/local/bin/start.sh"] diff --git a/conf/podsumer.conf b/conf/podsumer.conf index cfca4e1..1f8178e 100755 --- a/conf/podsumer.conf +++ b/conf/podsumer.conf @@ -11,20 +11,9 @@ per_item_art_download = 33 items_per_page = 10 ssl = false -# File Storage Method +# Media Directory # -# By default all data is stored in the database. If you would prefer -# to have your audio files and artwork stored on disk set this to -# `true` and specify a path where you would like podsumer to save -# files. If you have a very large libary this should improve the -# performance and size of backups. -# -# NOTE: If this value is changed after a library is already established, -# only _new_ items and feeds will use the new storage backend. 
Previously -# downloaded files will remain where they are. - -store_media_on_disk = true - +# Directory where media files (audio and images) are stored. # Make sure podsumer has permission to read and write in this directory. # media_dir cannot be changed once library is established. If changed, some media # files may no longer be servable. @@ -33,3 +22,64 @@ media_dir = /opt/media playback_interval = 5 playback_rewind = 5 +# PodcastIndex API key and secret +# +# These are used to search for podcasts and episodes using the PodcastIndex API. +# +# You can get your own API key and secret by signing up at https://podcastindex.org/ +# +# Once you have your API key and secret, you can add them here. +podcastindex_key = "YOUR_PODCAST_INDEX_KEY_HERE" +podcastindex_secret = "YOUR_PODCAST_INDEX_SECRET_HERE" + +# Ad Blocking Configuration +# +# Enable automatic ad detection and removal from podcasts. +# This feature uses OpenAI Whisper for transcription and the model set in openai_ad_detection_model for ad detection. +# Set to false to disable this feature and avoid API costs. +# +# Note: This requires an OpenAI API key and will incur costs based on usage. 
+ad_blocking_enabled = true + +# OpenAI API key for transcription and ad detection +openai_api_key = "YOUR_OPENAI_API_KEY_HERE" + +# OpenAI model for ad detection +# Options: gpt-4o-mini (cheaper), gpt-4o (more accurate but more expensive) +openai_ad_detection_model = "gpt-4o" + +# Whether to use ffmpeg to strip ads from audio files +# If false, ads will be skipped in the web player instead +use_ffmpeg_ad_removal = false + +# Ad segment merge buffer in seconds +# Ad segments that are within this many seconds of each other will be merged +# This reduces confusion and makes skipping easier by combining closely spaced ads +# Default is 8 seconds - increase for more aggressive merging, decrease for more precision +ad_merge_buffer_seconds = 8 + +# OpenAI Pricing Configuration +# +# OpenAI pricing per 1000 tokens as of December 2024 +# Update these values as OpenAI changes their pricing +# Current pricing can be found at: https://openai.com/api/pricing/ +# +# Note: The system will use actual token counts from API responses for accurate cost calculation +# rather than estimating based on character count. 
+# +# Whisper pricing per minute of audio +openai_whisper_cost_per_minute = 0.006 + +# GPT-4o-mini pricing per 1000 tokens - Current as of Dec 2024 +openai_gpt4o_mini_input_cost_per_1k_tokens = 0.00015 +openai_gpt4o_mini_output_cost_per_1k_tokens = 0.0006 + +# GPT-4o pricing per 1000 tokens - in case you want to use a different model +openai_gpt4o_input_cost_per_1k_tokens = 0.0025 +openai_gpt4o_output_cost_per_1k_tokens = 0.01 + +# Feed Refresh Configuration +# +# How often to refresh feeds in hours +# Default is 6 hours +feed_refresh_interval = 1 diff --git a/conf/test.conf b/conf/test.conf index f520714..39249a8 100644 --- a/conf/test.conf +++ b/conf/test.conf @@ -14,3 +14,6 @@ ssl = false media_dir = /opt/media playback_interval = 5 playback_rewind = 5 + +podcastindex_key = "" +podcastindex_secret = "" diff --git a/scripts/refresh_feeds.php b/scripts/refresh_feeds.php new file mode 100755 index 0000000..af64031 --- /dev/null +++ b/scripts/refresh_feeds.php @@ -0,0 +1,403 @@ +#!/usr/bin/env php +getState()->updateJobLog($job_id, $message); + } +} + +function logError(Main $main, ?int $job_id, string $error): void { + $message = "ERROR: " . $error; + echo $message . "\n"; + if ($job_id) { + $main->getState()->updateJobLog($job_id, $message); + } +} + +// Set up error handlers to capture all errors in job log +if ($job_id) { + // Capture PHP errors, warnings, notices + set_error_handler(function($severity, $message, $file, $line) use ($main, $job_id) { + $error_types = [ + E_ERROR => 'FATAL ERROR', + E_WARNING => 'WARNING', + E_NOTICE => 'NOTICE', + E_USER_ERROR => 'USER ERROR', + E_USER_WARNING => 'USER WARNING', + E_USER_NOTICE => 'USER NOTICE', + E_STRICT => 'STRICT', + E_RECOVERABLE_ERROR => 'RECOVERABLE ERROR', + E_DEPRECATED => 'DEPRECATED', + E_USER_DEPRECATED => 'USER DEPRECATED' + ]; + + $error_type = $error_types[$severity] ?? 
'UNKNOWN ERROR'; + $error_msg = "$error_type: $message in $file on line $line"; + logError($main, $job_id, $error_msg); + + // Don't execute PHP internal error handler + return true; + }); + + // Capture fatal errors and exceptions + register_shutdown_function(function() use ($main, $job_id) { + $error = error_get_last(); + if ($error && in_array($error['type'], [E_ERROR, E_PARSE, E_CORE_ERROR, E_COMPILE_ERROR])) { + $error_msg = "FATAL ERROR: {$error['message']} in {$error['file']} on line {$error['line']}"; + logError($main, $job_id, $error_msg); + + // Mark job as failed + try { + $main->getState()->failJob($job_id, $error_msg); + } catch (Exception $e) { + // If we can't even log the failure, write to error log as last resort + error_log("Failed to log job failure: " . $e->getMessage()); + } + } + }); + + // Capture uncaught exceptions + set_exception_handler(function($exception) use ($main, $job_id) { + $error_msg = "UNCAUGHT EXCEPTION: " . $exception->getMessage() . " in " . $exception->getFile() . " on line " . $exception->getLine(); + $error_msg .= "\nStack trace:\n" . $exception->getTraceAsString(); + logError($main, $job_id, $error_msg); + + // Mark job as failed + try { + $main->getState()->failJob($job_id, $error_msg); + } catch (Exception $e) { + error_log("Failed to log job failure: " . $e->getMessage()); + } + + exit(1); + }); +} + +function downloadItemAudio(Main $main, int $item_id, ?int $job_id = null): bool { + try { + $item = $main->getState()->getFeedItem($item_id); + if (empty($item)) { + logError($main, $job_id, "Item not found: $item_id"); + return false; + } + + if (!empty($item['audio_file'])) { + logMessage($main, $job_id, "Item already has audio downloaded: " . $item['name']); + return true; + } + + if (empty($item['audio_url'])) { + logError($main, $job_id, "No audio URL available for item: " . $item['name']); + return false; + } + + logMessage($main, $job_id, "Downloading audio for: " . 
$item['name']); + + $feed = $main->getState()->getFeedForItem($item_id); + $file = new File($main); + $file_id = $file->cacheUrl($item['audio_url'], $feed); + + $main->getState()->setItemAudioFile($item_id, $file_id); + + logMessage($main, $job_id, "Successfully downloaded audio for: " . $item['name']); + return true; + + } catch (Exception $e) { + logError($main, $job_id, "Error downloading audio for item $item_id: " . $e->getMessage()); + return false; + } +} + +function getItemsToDownload(Main $main, int $feed_id, ?int $job_id = null): array { + $items = $main->getState()->getFeedItems($feed_id); + + if (empty($items)) { + logMessage($main, $job_id, "No items found for feed $feed_id"); + return []; + } + + // Find items that have audio downloaded + $downloaded_items = array_filter($items, function($item) { + return !empty($item['audio_file']); + }); + + if (empty($downloaded_items)) { + // No items downloaded - download the first (newest) episode + logMessage($main, $job_id, "No episodes downloaded, will download first episode"); + return [$items[0]]; + } + + // Find the newest downloaded item by published date + $newest_downloaded = null; + $newest_downloaded_time = null; + + foreach ($downloaded_items as $item) { + $item_time = strtotime($item['published']); + if ($newest_downloaded_time === null || $item_time > $newest_downloaded_time) { + $newest_downloaded = $item; + $newest_downloaded_time = $item_time; + } + } + + // Find all items newer than the newest downloaded item + $items_to_download = []; + foreach ($items as $item) { + $item_time = strtotime($item['published']); + if (empty($item['audio_file']) && $item_time > $newest_downloaded_time) { + $items_to_download[] = $item; + } + } + + if (empty($items_to_download)) { + logMessage($main, $job_id, "No new episodes to download"); + } else { + logMessage($main, $job_id, "Found " . count($items_to_download) . 
" new episodes to download"); + } + + return $items_to_download; +} + +function detectIfNewFeed(Main $main, int $feed_id, ?int $job_id = null): bool { + $items = $main->getState()->getFeedItems($feed_id); + + if (empty($items)) { + return true; // No items means new feed + } + + // Check if ANY item has been downloaded + foreach ($items as $item) { + if (!empty($item['audio_file'])) { + return false; // Found a downloaded item, not a new feed + } + } + + return true; // No downloaded items, treat as new feed +} + +try { + if ($job_id) { + logMessage($main, $job_id, "Job started with ID: $job_id"); + if ($feed_id) { + logMessage($main, $job_id, "Processing feed ID: $feed_id"); + } + if ($item_id) { + logMessage($main, $job_id, "Processing item ID: $item_id"); + } + } + + # Handle download-only for specific item + if ($item_id !== null && $download_only) { + logMessage($main, $job_id, "Starting download for item $item_id"); + + $success = downloadItemAudio($main, $item_id, $job_id); + + if ($success) { + logMessage($main, $job_id, "Download completed successfully"); + if ($job_id) { + $main->getState()->completeJob($job_id, 0); + } + } else { + throw new Exception("Failed to download item $item_id"); + } + + exit(0); + } + + # Handle ad detection for specific item + if ($item_id !== null) { + logMessage($main, $job_id, "Starting ad detection for item $item_id"); + + $item = $main->getState()->getFeedItem($item_id); + if (empty($item)) { + throw new Exception("Item not found: $item_id"); + } + + logMessage($main, $job_id, "Found item: " . 
$item['name']); + + if (empty($item['audio_file'])) { + throw new Exception("No audio file available for item: $item_id"); + } + + logMessage($main, $job_id, "Audio file available, starting ad detection"); + + $adDetection = new AdDetection($main); + + # Get the audio file + $file_data = $main->getState()->getFileById($item['audio_file']); + + logMessage($main, $job_id, "Retrieved audio file data"); + + # Save audio to temporary file for processing + $temp_file = tempnam(sys_get_temp_dir(), 'podsumer_audio_'); + + # Determine file extension from mimetype + $mimetype = explode(';', $file_data['mimetype'])[0]; + $extension = match($mimetype) { + 'audio/mpeg', 'audio/mp3' => '.mp3', + 'audio/mp4' => '.mp4', + 'audio/m4a', 'audio/x-m4a' => '.m4a', + 'audio/wav', 'audio/x-wav' => '.wav', + 'audio/ogg' => '.ogg', + 'audio/flac' => '.flac', + 'audio/webm' => '.webm', + default => '.mp3' + }; + + $audio_file_path = $temp_file . $extension; + rename($temp_file, $audio_file_path); + + logMessage($main, $job_id, "Created temporary audio file: $audio_file_path"); + + file_put_contents($audio_file_path, $file_data['data']); + + logMessage($main, $job_id, "Written audio data to temporary file"); + + # Process ads + $cost = $adDetection->processItem($item_id, $audio_file_path); + $openai_cost += $cost; + + logMessage($main, $job_id, "Ad detection completed. Cost: $" . number_format($cost, 4)); + + # Clean up temporary file + unlink($audio_file_path); + + logMessage($main, $job_id, "Cleaned up temporary file"); + + if ($job_id) { + $main->getState()->completeJob($job_id, $openai_cost); + logMessage($main, $job_id, "Job completed successfully"); + } + + exit(0); + } + + # Handle feed refresh + if ($feed_id !== null) { + $feed = $main->getState()->getFeed($feed_id); + if ($feed) { + $feeds = [$feed]; + logMessage($main, $job_id, "Processing single feed: " . 
$feed['name']); + } else { + throw new Exception("Feed not found: $feed_id"); + } + } else { + $feeds = $main->getState()->getFeeds(); + logMessage($main, $job_id, "Processing all feeds: " . count($feeds) . " feeds found"); + } + + $total_feeds = count($feeds); + $processed_feeds = 0; + + foreach ($feeds as $feed) { + try { + logMessage($main, $job_id, "Processing feed: " . $feed['name'] . " (ID: " . $feed['id'] . ")"); + + + + # Refresh feed + logMessage($main, $job_id, "Creating Feed object for URL: " . $feed['url']); + $refresh_feed = new Feed($feed['url']); + $refresh_feed->setFeedId($feed['id']); + + logMessage($main, $job_id, "Adding feed to database"); + $main->getState()->addFeed($refresh_feed); + + logMessage($main, $job_id, "Feed refresh completed for: " . $feed['name']); + + # Automatic downloading logic + $is_new_feed = detectIfNewFeed($main, $feed['id'], $job_id); + + if ($is_new_feed) { + logMessage($main, $job_id, "New feed detected, will download first episode"); + $items = $main->getState()->getFeedItems($feed['id']); + if (!empty($items)) { + $success = downloadItemAudio($main, $items[0]['id'], $job_id); + if ($success) { + logMessage($main, $job_id, "Successfully downloaded first episode for new feed"); + } else { + logMessage($main, $job_id, "Failed to download first episode for new feed"); + } + } else { + logMessage($main, $job_id, "No items found for new feed"); + } + } else { + logMessage($main, $job_id, "Existing feed, checking for new episodes to download"); + $items_to_download = getItemsToDownload($main, $feed['id'], $job_id); + + if (!empty($items_to_download)) { + $download_count = 0; + foreach ($items_to_download as $item) { + $success = downloadItemAudio($main, $item['id'], $job_id); + if ($success) { + $download_count++; + } + } + logMessage($main, $job_id, "Successfully downloaded $download_count out of " . count($items_to_download) . 
" new episodes"); + } else { + logMessage($main, $job_id, "No new episodes to download"); + } + } + + logMessage($main, $job_id, "Feed refresh and download completed, ad processing will be handled separately"); + + $processed_feeds++; + logMessage($main, $job_id, "Completed processing feed: " . $feed['name'] . " ($processed_feeds/$total_feeds)"); + + } catch (Exception $e) { + $error_msg = "Error processing feed " . $feed['name'] . ": " . $e->getMessage(); + logError($main, $job_id, $error_msg); + + if ($job_id) { + $main->getState()->failJob($job_id, $error_msg); + } + continue; + } + } + + if ($job_id) { + $main->getState()->completeJob($job_id, $openai_cost); + logMessage($main, $job_id, "All feeds processed successfully. Total cost: $" . number_format($openai_cost, 4)); + } + +} catch (Exception $e) { + $error_msg = "Fatal error: " . $e->getMessage(); + logError($main, $job_id, $error_msg); + + if ($job_id) { + $main->getState()->failJob($job_id, $error_msg); + } + + exit(1); +} \ No newline at end of file diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000..aa86dc2 Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/Brickner/.DS_Store b/src/Brickner/.DS_Store new file mode 100644 index 0000000..9558795 Binary files /dev/null and b/src/Brickner/.DS_Store differ diff --git a/src/Brickner/Podsumer/AdDetection.php b/src/Brickner/Podsumer/AdDetection.php new file mode 100644 index 0000000..4bec4a8 --- /dev/null +++ b/src/Brickner/Podsumer/AdDetection.php @@ -0,0 +1,1549 @@ +main = $main; + $this->api_key = strval($this->main->getConf('podsumer', 'openai_api_key')); + + if (empty($this->api_key)) { + throw new Exception('OpenAI API key not configured'); + } + } + + /** + * Get the ad detection model from configuration + */ + protected function getAdDetectionModel(): string + { + return strval($this->main->getConf('podsumer', 'openai_ad_detection_model') ?? 
self::AD_DETECTION_MODEL); + } + + /** + * Get model information from OpenAI API + */ + protected function getModelInfo(?string $model = null): array + { + if ($model === null) { + $model = $this->getAdDetectionModel(); + } + + // Return cached info if available + if ($this->model_info_cache !== null && isset($this->model_info_cache['id']) && $this->model_info_cache['id'] === $model) { + return $this->model_info_cache; + } + + try { + $ch = curl_init(); + + curl_setopt($ch, CURLOPT_URL, "https://api.openai.com/v1/models/{$model}"); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_HTTPHEADER, [ + 'Authorization: Bearer ' . $this->api_key, + 'Content-Type: application/json' + ]); + curl_setopt($ch, CURLOPT_TIMEOUT, 30); + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10); + + $response = curl_exec($ch); + $curl_error = curl_error($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($curl_error) { + throw new Exception("cURL error fetching model info: $curl_error"); + } + + if ($httpCode !== 200) { + throw new Exception("OpenAI API error fetching model info (HTTP $httpCode): $response"); + } + + $result = json_decode($response, true); + if (json_last_error() !== JSON_ERROR_NONE) { + throw new Exception('Invalid JSON response from OpenAI API: ' . json_last_error_msg()); + } + + // Cache the result + $this->model_info_cache = $result; + + return $result; + + } catch (Exception $e) { + $this->main->log("Failed to fetch model info for {$model}: " . $e->getMessage()); + + // Fall back to known defaults for common models + $defaults = [ + 'gpt-4o-mini' => ['context_window' => 128000], + 'gpt-4o' => ['context_window' => 128000], + 'gpt-4-turbo' => ['context_window' => 128000], + 'gpt-4' => ['context_window' => 8192], + 'gpt-3.5-turbo' => ['context_window' => 16384], + ]; + + if (isset($defaults[$model])) { + $this->main->log("Using fallback context window for {$model}: " . 
$defaults[$model]['context_window']); + $this->model_info_cache = [ + 'id' => $model, + 'context_window' => $defaults[$model]['context_window'] + ]; + return $this->model_info_cache; + } + + // Ultimate fallback + $this->main->log("Using ultimate fallback context window: 128000"); + $this->model_info_cache = [ + 'id' => $model, + 'context_window' => 128000 + ]; + return $this->model_info_cache; + } + } + + /** + * Get the maximum context window for the model + */ + protected function getModelContextLimit(?string $model = null): int + { + $modelInfo = $this->getModelInfo($model); + + // Try different possible keys for context window + if (isset($modelInfo['context_window'])) { + return intval($modelInfo['context_window']); + } elseif (isset($modelInfo['max_tokens'])) { + return intval($modelInfo['max_tokens']); + } elseif (isset($modelInfo['context_length'])) { + return intval($modelInfo['context_length']); + } + + // Fallback to a reasonable default + $this->main->log("Could not determine context limit for {$model}, using fallback: 128000"); + return 128000; + } + + /** + * Calculate optimal chunk size based on model context limit + */ + protected function calculateOptimalChunkSize(int $contextLimit): int + { + // Reserve tokens for: + // - System prompt (~100 tokens) + // - User prompt template (~500 tokens - increased from 200 due to longer prompt) + // - Response (~1000 tokens for ad detection JSON) + // - Safety buffer (~1000 tokens for overlap and safety) + $reservedTokens = 2600; + + $availableTokens = $contextLimit - $reservedTokens; + + // Use 25% of model's maximum context as requested by user (reduced from 70% of available) + // This is more conservative and allows for better processing of smaller chunks + $maxChunkTokens = intval($contextLimit * 0.25); + + // Ensure we don't exceed available tokens after reservations + $maxChunkTokens = min($maxChunkTokens, $availableTokens); + + $this->main->log("Model context limit: {$contextLimit}, Using 25% for 
chunks: {$maxChunkTokens} tokens"); + + return max(4000, $maxChunkTokens); // Minimum 4k tokens to ensure useful chunks + } + + /** + * Get audio file duration using ffmpeg + */ + protected function getAudioDuration(string $file_path): float + { + // Check if ffprobe is available + $checkCmd = 'which ffprobe 2>/dev/null'; + $ffprobePath = shell_exec($checkCmd); + if ($ffprobePath === null || trim($ffprobePath) === '') { + throw new Exception('ffprobe not found. Please ensure ffmpeg is installed.'); + } + + $cmd = sprintf( + 'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 %s', + escapeshellarg($file_path) + ); + + $output = shell_exec($cmd); + if ($output === null || !is_numeric(trim($output))) { + throw new Exception('Could not determine audio duration'); + } + return floatval(trim($output)); + } + + /** + * Split audio file into chunks + */ + protected function splitAudioFile(string $input_file): array + { + $chunks = []; + $duration = $this->getAudioDuration($input_file); + + // Use smaller chunks for better reliability (5 minutes max) + $chunkDuration = min(self::CHUNK_DURATION_SECONDS, self::MAX_CHUNK_DURATION_SECONDS); + $num_chunks = ceil($duration / $chunkDuration); + + $this->main->log("Splitting audio file: duration={$duration}s, chunk_duration={$chunkDuration}s, num_chunks={$num_chunks}"); + + // Detect input file format + $pathInfo = pathinfo($input_file); + $extension = isset($pathInfo['extension']) ? strtolower($pathInfo['extension']) : 'mp3'; + + // Map common extensions to codec settings + $codecSettings = match($extension) { + 'mp3' => '-c:a libmp3lame -b:a 128k', + 'mp4', 'm4a' => '-c:a aac -b:a 128k', + 'ogg' => '-c:a libvorbis -b:a 128k', + 'flac' => '-c:a flac', + 'wav' => '-c:a pcm_s16le', + default => '-c:a libmp3lame -b:a 128k' // Default to mp3 + }; + + for ($i = 0; $i < $num_chunks; $i++) { + $start_time = $i * $chunkDuration; + $chunk_file = tempnam(sys_get_temp_dir(), 'podsumer_chunk_') . '.' . 
$extension; + + $cmd = sprintf( + 'ffmpeg -i %s -ss %d -t %d %s -y %s 2>&1', + escapeshellarg($input_file), + $start_time, + $chunkDuration, + $codecSettings, + escapeshellarg($chunk_file) + ); + + exec($cmd, $output, $returnVar); + + if ($returnVar !== 0) { + // Clean up any created chunks on error + foreach ($chunks as $chunk) { + if (isset($chunk['file']) && file_exists($chunk['file'])) { + @unlink($chunk['file']); + } + } + throw new Exception('Failed to split audio file: ' . implode("\n", $output)); + } + + $chunks[] = [ + 'file' => $chunk_file, + 'start_offset' => $start_time + ]; + } + + return $chunks; + } + + /** + * Transcribe a single audio file chunk + */ + protected function transcribeChunk(string $audio_file_path): array + { + try { + if (!file_exists($audio_file_path)) { + throw new Exception("Audio file not found: $audio_file_path"); + } + + if (!is_readable($audio_file_path)) { + throw new Exception("Audio file not readable: $audio_file_path"); + } + + // Check file size and log warning if it's approaching limits + $fileSize = filesize($audio_file_path); + $this->main->log("Transcribing chunk: " . basename($audio_file_path) . " (size: " . number_format($fileSize / 1024 / 1024, 2) . " MB)"); + + if ($fileSize > self::WHISPER_FILE_SIZE_LIMIT) { + $this->main->log("WARNING: Chunk file size (" . number_format($fileSize / 1024 / 1024, 2) . " MB) exceeds recommended limit"); + } + + $ch = curl_init(); + + $postData = [ + 'file' => new \CURLFile($audio_file_path), + 'model' => 'whisper-1', + 'response_format' => 'verbose_json', + 'timestamp_granularities' => ['segment'] + ]; + + curl_setopt($ch, CURLOPT_URL, 'https://api.openai.com/v1/audio/transcriptions'); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_POST, true); + curl_setopt($ch, CURLOPT_POSTFIELDS, $postData); + curl_setopt($ch, CURLOPT_HTTPHEADER, [ + 'Authorization: Bearer ' . 
$this->api_key + ]); + // Increase timeout to 10 minutes for large files, with longer connection timeout + curl_setopt($ch, CURLOPT_TIMEOUT, 600); // 10 minute timeout + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 60); // 60 second connection timeout + // Add progress callback to prevent timeout on slow uploads + curl_setopt($ch, CURLOPT_NOPROGRESS, false); + curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, function($resource, $download_size, $downloaded, $upload_size, $uploaded) { + // This prevents timeout during long uploads by keeping the connection active + return 0; + }); + + $response = curl_exec($ch); + $curl_error = curl_error($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($curl_error) { + throw new Exception("cURL error during transcription: $curl_error"); + } + + if ($httpCode !== 200) { + throw new Exception("Whisper API error (HTTP $httpCode): $response"); + } + + $result = json_decode($response, true); + if (json_last_error() !== JSON_ERROR_NONE) { + throw new Exception('Invalid JSON response from Whisper API: ' . json_last_error_msg()); + } + + if (!isset($result['segments'])) { + throw new Exception('Whisper API response missing segments data'); + } + + return $result; + + } catch (Exception $e) { + $this->main->log("Transcription error for file $audio_file_path: " . $e->getMessage()); + throw new Exception("Audio transcription failed: " . $e->getMessage(), 0, $e); + } + } + + /** + * Merge transcripts from multiple chunks + */ + protected function mergeTranscripts(array $chunk_transcripts): array + { + $merged = [ + 'text' => '', + 'segments' => [], + 'language' => $chunk_transcripts[0]['language'] ?? 'en' + ]; + + foreach ($chunk_transcripts as $index => $transcript) { + $offset = $transcript['offset']; + + // Add text with space separator + if (!empty($merged['text'])) { + $merged['text'] .= ' '; + } + $transcriptText = isset($transcript['text']) ? 
strval($transcript['text']) : ''; + $merged['text'] .= $transcriptText; + + // Adjust segment timestamps and merge + if (isset($transcript['segments']) && is_array($transcript['segments'])) { + foreach ($transcript['segments'] as $segment) { + if (isset($segment['start']) && isset($segment['end'])) { + $adjustedSegment = $segment; + $adjustedSegment['start'] = floatval($segment['start']) + $offset; + $adjustedSegment['end'] = floatval($segment['end']) + $offset; + $merged['segments'][] = $adjustedSegment; + } + } + } + } + + return $merged; + } + + /** + * Transcribe audio file using OpenAI Whisper API + * + * @param string $audio_file_path Path to the audio file + * @return array Transcript with timestamps + */ + public function transcribeAudio(string $audio_file_path): array + { + $this->last_transcription_cost = $this->calculateTranscriptionCost($audio_file_path); + + $fileSize = filesize($audio_file_path); + + // If file is small enough, transcribe directly + if ($fileSize <= self::WHISPER_FILE_SIZE_LIMIT) { + return $this->transcribeChunk($audio_file_path); + } + + // File is too large, split into chunks + $this->main->log("Audio file too large ({$fileSize} bytes), splitting into chunks..."); + + $chunks = $this->splitAudioFile($audio_file_path); + $chunk_transcripts = []; + + try { + foreach ($chunks as $index => $chunk) { + $this->main->log("Transcribing chunk " . ($index + 1) . " of " . count($chunks)); + + // Add retry logic for individual chunks + $maxRetries = 2; + $transcript = null; + $lastError = null; + + for ($retry = 0; $retry <= $maxRetries; $retry++) { + try { + $transcript = $this->transcribeChunk($chunk['file']); + $transcript['offset'] = $chunk['start_offset']; + break; // Success, exit retry loop + } catch (Exception $e) { + $lastError = $e; + if ($retry < $maxRetries) { + $this->main->log("Transcription failed for chunk " . ($index + 1) . ", retrying... (attempt " . ($retry + 2) . "/" . ($maxRetries + 1) . 
")"); + sleep(2); // Wait 2 seconds before retry + } else { + $this->main->log("Transcription failed for chunk " . ($index + 1) . " after " . ($maxRetries + 1) . " attempts"); + } + } + } + + if ($transcript === null) { + throw new Exception("Failed to transcribe chunk after " . ($maxRetries + 1) . " attempts: " . $lastError->getMessage()); + } + + $chunk_transcripts[] = $transcript; + + // Clean up chunk file + if (file_exists($chunk['file'])) { + @unlink($chunk['file']); + $this->main->log("Cleaned up chunk file: " . basename($chunk['file'])); + } + } + + // Merge all transcripts + return $this->mergeTranscripts($chunk_transcripts); + + } catch (Exception $e) { + // Clean up any remaining chunk files on error + foreach ($chunks as $chunk) { + if (isset($chunk['file']) && file_exists($chunk['file'])) { + @unlink($chunk['file']); + $this->main->log("Cleaned up chunk file on error: " . basename($chunk['file'])); + } + } + throw $e; + } + } + + /** + * Detect ad sections in transcript using GPT-4o-mini + * + * @param array $transcript Transcript data from Whisper + * @param string $show Show/feed title + * @param string $episode Episode/item title + * @return array Ad sections with start/end timestamps + */ + public function detectAds(array $transcript, string $show, string $episode): array + { + try { + $segments = $transcript['segments'] ?? []; + if (empty($segments)) { + return []; + } + + // Check if transcript is too large and needs to be chunked + $transcriptText = $this->formatTranscriptText($segments); + + // Get the model's actual context limit and calculate optimal chunk size + $contextLimit = $this->getModelContextLimit(); + $maxTokensPerChunk = $this->calculateOptimalChunkSize($contextLimit); + + // More conservative token estimation: ~3 characters per token for formatted text + // (reduced from 4 to be more conservative due to transcript formatting) + $estimatedTokens = intval(strlen($transcriptText) / 3); + + $this->main->log("Transcript length: " . 
strlen($transcriptText) . " characters, estimated tokens: {$estimatedTokens}, max per chunk: {$maxTokensPerChunk} (25% of context)"); + + $adSections = []; + + if ($estimatedTokens > $maxTokensPerChunk) { + $this->main->log("Large transcript detected ({$estimatedTokens} estimated tokens), processing with overlapping chunks (max per chunk: {$maxTokensPerChunk}, 5% overlap)"); + $adSections = $this->detectAdsInChunks($segments, $maxTokensPerChunk, $show, $episode); + } else { + $this->main->log("Processing transcript in single request ({$estimatedTokens} estimated tokens, within chunk limit: {$maxTokensPerChunk})"); + $adSections = $this->detectAdsInSingleRequest($transcriptText, $show, $episode); + } + + // Refine ad boundaries if sections were found + if (!empty($adSections)) { + $this->main->log("Refining ad boundaries for " . count($adSections) . " detected sections"); + + // Before refinement, merge sections that are within 33 seconds of each other + // This helps when there are multiple ads in the 6-minute refinement window + $adSectionsForRefinement = $this->mergeOverlappingAdSections($adSections, 33.0); + $this->main->log("Merged close sections for refinement: " . count($adSections) . " => " . count($adSectionsForRefinement) . " sections (33s merge buffer)"); + + $adSections = $this->refineAdBoundaries($adSectionsForRefinement, $transcript); + } + + return $adSections; + + } catch (Exception $e) { + $this->main->log("Ad detection error: " . $e->getMessage()); + throw new Exception("Ad detection failed: " . 
$e->getMessage(), 0, $e); + } + } + + /** + * Format transcript segments into text with timestamps + */ + protected function formatTranscriptText(array $segments): string + { + $transcriptText = ''; + foreach ($segments as $segment) { + $start = floatval($segment['start']); + $end = floatval($segment['end']); + $startFormatted = gmdate('H:i:s', intval($start)); + $endFormatted = gmdate('H:i:s', intval($end)); + $text = isset($segment['text']) && is_string($segment['text']) ? trim($segment['text']) : ''; + if (!empty($text)) { + $transcriptText .= "[{$startFormatted} - {$endFormatted}] (start: {$start}s, end: {$end}s) {$text}\n"; + } + } + return $transcriptText; + } + + /** + * Process large transcripts by splitting them into chunks + */ + protected function detectAdsInChunks(array $segments, int $maxTokensPerChunk, string $show, string $episode): array + { + $allAdSections = []; + $totalCost = 0.0; + + // Find the total duration + $totalDuration = 0; + foreach ($segments as $segment) { + $end = floatval($segment['end']); + if ($end > $totalDuration) { + $totalDuration = $end; + } + } + + // Create chunks based on token count with 5% overlap on each side + $chunks = $this->createOptimalChunks($segments, $maxTokensPerChunk); + $numChunks = count($chunks); + + $this->main->log("Splitting transcript into {$numChunks} overlapping chunks (max {$maxTokensPerChunk} tokens per chunk, 5% overlap each side)"); + + foreach ($chunks as $i => $chunk) { + $chunkSegments = $chunk['segments']; + $chunkStart = $chunk['start_time']; + $chunkEnd = $chunk['end_time']; + $estimatedTokens = $chunk['estimated_tokens']; + + $chunkText = $this->formatTranscriptText($chunkSegments); + $this->main->log("Processing chunk " . ($i + 1) . "/{$numChunks} (duration: " . gmdate('H:i:s', intval($chunkStart)) . " - " . gmdate('H:i:s', intval($chunkEnd)) . ", ~{$estimatedTokens} tokens, " . count($chunkSegments) . 
" segments)"); + + $chunkAdSections = $this->detectAdsInSingleRequest($chunkText, $show, $episode); + + // Add the chunk's ad sections to the overall results + $allAdSections = array_merge($allAdSections, $chunkAdSections); + + $totalCost += $this->last_detection_cost; + } + + // Update the total cost + $this->last_detection_cost = $totalCost; + + // Sort and merge overlapping ad sections + $mergedAdSections = $this->mergeOverlappingAdSections($allAdSections); + + $this->main->log("Found " . count($mergedAdSections) . " ad sections after processing {$numChunks} overlapping chunks"); + + return $mergedAdSections; + } + + /** + * Create optimal chunks based on token limits with 5% overlap on each side + */ + protected function createOptimalChunks(array $segments, int $maxTokensPerChunk): array + { + if (empty($segments)) { + return []; + } + + $chunks = []; + $totalSegments = count($segments); + + // Calculate 5% overlap in terms of tokens + $overlapTokens = intval($maxTokensPerChunk * 0.05); + + // Use 90% of max chunk size as the target to leave room for overlap + $targetChunkTokens = intval($maxTokensPerChunk * 0.90); + + $currentSegmentIndex = 0; + + while ($currentSegmentIndex < $totalSegments) { + $chunk = [ + 'segments' => [], + 'start_time' => 0, + 'end_time' => 0, + 'estimated_tokens' => 0 + ]; + + // Add segments starting from current index + $chunkTokens = 0; + $segmentIndex = $currentSegmentIndex; + + // Build the core chunk + while ($segmentIndex < $totalSegments) { + $segment = $segments[$segmentIndex]; + $segmentText = isset($segment['text']) && is_string($segment['text']) ? 
trim($segment['text']) : ''; + + if (empty($segmentText)) { + $segmentIndex++; + continue; + } + + // Estimate tokens for this segment (including timestamp formatting) + $start = floatval($segment['start']); + $end = floatval($segment['end']); + $startFormatted = gmdate('H:i:s', intval($start)); + $endFormatted = gmdate('H:i:s', intval($end)); + $formattedSegment = "[{$startFormatted} - {$endFormatted}] (start: {$start}s, end: {$end}s) {$segmentText}\n"; + + // Conservative token estimation: ~3 chars per token for formatted text + $segmentTokens = intval(strlen($formattedSegment) / 3); + + // Check if adding this segment would exceed our target + if ($chunkTokens + $segmentTokens > $targetChunkTokens && !empty($chunk['segments'])) { + break; + } + + // Add segment to chunk + $chunk['segments'][] = $segment; + $chunkTokens += $segmentTokens; + + // Update chunk time boundaries + if ($chunk['start_time'] == 0) { + $chunk['start_time'] = $start; + } + $chunk['end_time'] = $end; + + $segmentIndex++; + } + + // If this is not the first chunk, add overlap from previous segments (5% overlap on left side) + if ($currentSegmentIndex > 0) { + $leftOverlapSegments = []; + $leftOverlapTokens = 0; + $leftIndex = $currentSegmentIndex - 1; + + // Go backwards to collect segments for left overlap + while ($leftIndex >= 0 && $leftOverlapTokens < $overlapTokens) { + $segment = $segments[$leftIndex]; + $segmentText = isset($segment['text']) && is_string($segment['text']) ? 
trim($segment['text']) : ''; + + if (!empty($segmentText)) { + $start = floatval($segment['start']); + $end = floatval($segment['end']); + $startFormatted = gmdate('H:i:s', intval($start)); + $endFormatted = gmdate('H:i:s', intval($end)); + $formattedSegment = "[{$startFormatted} - {$endFormatted}] (start: {$start}s, end: {$end}s) {$segmentText}\n"; + $segmentTokens = intval(strlen($formattedSegment) / 3); + + // Check if we can fit this segment in the overlap + if ($leftOverlapTokens + $segmentTokens <= $overlapTokens) { + array_unshift($leftOverlapSegments, $segment); + $leftOverlapTokens += $segmentTokens; + $chunk['start_time'] = $start; // Update start time for overlap + } else { + break; + } + } + + $leftIndex--; + } + + // Prepend overlap segments to the chunk + $chunk['segments'] = array_merge($leftOverlapSegments, $chunk['segments']); + $chunkTokens += $leftOverlapTokens; + } + + // If this is not the last chunk, add overlap from next segments (5% overlap on right side) + if ($segmentIndex < $totalSegments) { + $rightOverlapSegments = []; + $rightOverlapTokens = 0; + $rightIndex = $segmentIndex; + + // Go forwards to collect segments for right overlap + while ($rightIndex < $totalSegments && $rightOverlapTokens < $overlapTokens) { + $segment = $segments[$rightIndex]; + $segmentText = isset($segment['text']) && is_string($segment['text']) ? 
trim($segment['text']) : ''; + + if (!empty($segmentText)) { + $start = floatval($segment['start']); + $end = floatval($segment['end']); + $startFormatted = gmdate('H:i:s', intval($start)); + $endFormatted = gmdate('H:i:s', intval($end)); + $formattedSegment = "[{$startFormatted} - {$endFormatted}] (start: {$start}s, end: {$end}s) {$segmentText}\n"; + $segmentTokens = intval(strlen($formattedSegment) / 3); + + // Check if we can fit this segment in the overlap + if ($rightOverlapTokens + $segmentTokens <= $overlapTokens) { + $rightOverlapSegments[] = $segment; + $rightOverlapTokens += $segmentTokens; + $chunk['end_time'] = $end; // Update end time for overlap + } else { + break; + } + } + + $rightIndex++; + } + + // Append overlap segments to the chunk + $chunk['segments'] = array_merge($chunk['segments'], $rightOverlapSegments); + $chunkTokens += $rightOverlapTokens; + } + + $chunk['estimated_tokens'] = $chunkTokens; + + if (!empty($chunk['segments'])) { + $chunks[] = $chunk; + } + + // Move to the next chunk starting point + // The next chunk starts where the core content of this chunk ended (not including right overlap) + $currentSegmentIndex = $segmentIndex; + } + + return $chunks; + } + + /** + * Process transcript in a single API request + */ + protected function detectAdsInSingleRequest(string $transcriptText, string $show, string $episode): array + { + $prompt = "Your job is to read each and every line in the following podcast transcript excerpt and judge, considering the lines before and after each line, if that line is a part of an advertisement, a sponsorsed segment, or native advertising.: An advertisement is when a paid product, recurring subscription, corporation, company, or non-profit is being explicitly promoted. Many hosts and guests talk about themselves and their output, this is not to be considered a promotion. 
If the host or guest is talking about call to actions for paid events, merch sales, donations, monetary contributions, recurring subscriptions, patreon, super chats, etc. those are categorically advertisements. Mentions by name of corporations at the very beginning or end are ads. When you find an entry in the transcript you suspect might be a match search surrounding entries for at least 30-45 seconds to see where the real content ends and when it starts again. PINPOINT THE EXACT RANGE OF THE MATCHED CONTENT. \n\nReturn a JSON object with a key 'segments' containing an array of objects with 'start' and 'end' timestamps (in seconds) for each matched section. When you've found the entries that match, use the end timestamp from the entry immediately preceding the first entry in the matched section as the START value. For the END value, use the start timestamp from the transcript entry immediately following the last transcript entry in the matched section. Also include a reason key with a brief description of why this was matched. Example format: {\"segments\": [{\"start\": 0, \"end\": 30, \"reason\": \"\"}, {\"start\": 600, \"end\": 660, \"reason\": \"\"}]}. Show name: {$show}\n\n Episode Title: {$episode}\n\n \n\nTranscript:\n```{$transcriptText}```"; + + $ch = curl_init(); + + $model = $this->getAdDetectionModel(); + + $postData = json_encode([ + 'model' => $model, + 'messages' => [ + [ + 'role' => 'system', + 'content' => 'You are an anti advertising activist. Your movement has won unanimous favor in society. You are now the national czar for removing ads from podcasts. The highest honor of your dreams. You can spot the beginning and end of an ad like a hawk.' 
+ ], + [ + 'role' => 'user', + 'content' => $prompt + ] + ], + 'temperature' => 0.3, + 'response_format' => [ + 'type' => 'json_schema', + 'json_schema' => [ + 'name' => 'ad_detection_response', + 'strict' => true, + 'schema' => [ + 'type' => 'object', + 'properties' => [ + 'segments' => [ + 'type' => 'array', + 'items' => [ + 'type' => 'object', + 'properties' => [ + 'start' => ['type' => 'number'], + 'end' => ['type' => 'number'], + 'reason' => ['type' => 'string'] + ], + 'required' => ['start', 'end', 'reason'], + 'additionalProperties' => false + ] + ] + ], + 'required' => ['segments'], + 'additionalProperties' => false + ] + ] + ] + ]); + + curl_setopt($ch, CURLOPT_URL, 'https://api.openai.com/v1/chat/completions'); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_POST, true); + curl_setopt($ch, CURLOPT_POSTFIELDS, $postData); + curl_setopt($ch, CURLOPT_HTTPHEADER, [ + 'Authorization: Bearer ' . $this->api_key, + 'Content-Type: application/json' + ]); + curl_setopt($ch, CURLOPT_TIMEOUT, 120); // 2 minute timeout + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30); // 30 second connection timeout + + $response = curl_exec($ch); + $curl_error = curl_error($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($curl_error) { + throw new Exception("cURL error during ad detection: $curl_error"); + } + + if ($httpCode !== 200) { + throw new Exception("GPT API error (HTTP $httpCode): $response"); + } + + $result = json_decode($response, true); + if (json_last_error() !== JSON_ERROR_NONE) { + throw new Exception('Invalid JSON response from GPT API: ' . json_last_error_msg()); + } + + if (!isset($result['choices'][0]['message']['content'])) { + throw new Exception('GPT API response missing content data'); + } + + $content = $result['choices'][0]['message']['content'] ?? 
'{}'; + + // Calculate cost using actual token usage from API response + $this->last_detection_cost = $this->calculateGptCostFromUsage($result['usage'] ?? [], $model); + + $adData = json_decode($content, true); + + // Log the raw response for debugging + $this->main->log("GPT Ad Detection Response: " . $content); + + // Ensure we have a proper array structure + if (!is_array($adData)) { + $this->main->log("Invalid ad data structure: " . print_r($adData, true)); + return []; + } + + // Handle different possible response formats + $adSections = []; + + // If it's a direct array of ad objects + if (isset($adData[0]) && isset($adData[0]['start'])) { + $adSections = $adData; + } + // If it's wrapped in a 'segments' key (preferred format) + elseif (isset($adData['segments']) && is_array($adData['segments'])) { + $adSections = $adData['segments']; + } + // If it's wrapped in an 'ads' key + elseif (isset($adData['ads']) && is_array($adData['ads'])) { + $adSections = $adData['ads']; + } + // If it's wrapped in some other key, try to find an array with start/end + else { + foreach ($adData as $key => $value) { + if (is_array($value) && !empty($value)) { + if (isset($value[0]['start']) || (isset($value[0]) && is_array($value[0]))) { + $adSections = $value; + break; + } + } + } + } + + // Validate and sanitize ad sections + $validatedSections = []; + foreach ($adSections as $section) { + if (isset($section['start']) && isset($section['end'])) { + $validatedSections[] = [ + 'start' => floatval($section['start']), + 'end' => floatval($section['end']), + 'reason' => isset($section['reason']) ? 
strval($section['reason']) : '' + ]; + } + } + + // Always merge overlapping sections before returning + return $this->mergeOverlappingAdSections($validatedSections); + } + + /** + * Merge overlapping ad sections with configurable buffer + * + * @param array $adSections Ad sections to merge + * @param float|null $customBuffer Optional custom buffer in seconds, defaults to config value + * @return array Merged ad sections + */ + protected function mergeOverlappingAdSections(array $adSections, ?float $customBuffer = null): array + { + if (empty($adSections)) { + return []; + } + + // Use custom buffer if provided, otherwise get from config (default 8 seconds) + $mergeBuffer = $customBuffer ?? floatval($this->main->getConf('podsumer', 'ad_merge_buffer_seconds') ?? 8.0); + + // Sort by start time + usort($adSections, function($a, $b) { + return $a['start'] <=> $b['start']; + }); + + $merged = []; + $current = $adSections[0]; + + $this->main->log("Merging ad sections with {$mergeBuffer}s buffer. Input sections: " . count($adSections)); + + for ($i = 1; $i < count($adSections); $i++) { + $next = $adSections[$i]; + + // Calculate the gap between current section end and next section start + $gap = $next['start'] - $current['end']; + + // If sections overlap (negative gap) or are within the merge buffer, merge them + if ($gap <= $mergeBuffer) { + $this->main->log("Merging sections: [{$current['start']}-{$current['end']}] and [{$next['start']}-{$next['end']}] (gap: {$gap}s)"); + + // Update the end time to the maximum of both sections + $current['end'] = max($current['end'], $next['end']); + + // If the next section starts before current ends (true overlap), + // also make sure we capture the earliest start time + if ($next['start'] < $current['start']) { + $current['start'] = $next['start']; + } + + // Combine reasons if both exist + $currentReason = $current['reason'] ?? ''; + $nextReason = $next['reason'] ?? 
''; + if (!empty($currentReason) && !empty($nextReason) && $currentReason !== $nextReason) { + $current['reason'] = $currentReason . "\n\n" . $nextReason; + } elseif (empty($currentReason) && !empty($nextReason)) { + $current['reason'] = $nextReason; + } + } else { + // Gap is too large, don't merge - save current and move to next + $merged[] = $current; + $current = $next; + } + } + + // Add the last section + $merged[] = $current; + + $this->main->log("Ad section merging complete. Output sections: " . count($merged)); + + // Log the final merged sections for debugging + foreach ($merged as $i => $section) { + $duration = $section['end'] - $section['start']; + $this->main->log("Merged section " . ($i + 1) . ": {$section['start']}s - {$section['end']}s (duration: {$duration}s)"); + } + + return $merged; + } + + /** + * Refine ad boundaries by analyzing surrounding transcript context + * + * @param array $adSections Initial ad sections with rough boundaries + * @param array $transcript Full transcript with segments + * @return array Refined ad sections with precise boundaries + */ + protected function refineAdBoundaries(array $adSections, array $transcript): array + { + if (empty($adSections) || empty($transcript['segments'])) { + return $adSections; + } + + $refinedSections = []; + $totalRefinementCost = 0.0; + + $this->main->log("Starting ad boundary refinement for " . count($adSections) . 
" sections"); + + foreach ($adSections as $index => $section) { + try { + // Calculate the midpoint of the ad section + $midpoint = ($section['start'] + $section['end']) / 2; + + // Extract 3 minutes (180 seconds) before and after the midpoint + $contextStart = max(0, $midpoint - 180); + $contextEnd = $midpoint + 180; + + // Find all transcript segments within this time range + $contextSegments = []; + foreach ($transcript['segments'] as $segment) { + $segmentStart = floatval($segment['start']); + $segmentEnd = floatval($segment['end']); + + // Include segments that overlap with our context window + if ($segmentEnd >= $contextStart && $segmentStart <= $contextEnd) { + $contextSegments[] = $segment; + } + } + + if (empty($contextSegments)) { + $this->main->log("No transcript segments found for refinement of section " . ($index + 1)); + $refinedSections[] = $section; + continue; + } + + // Format the context transcript with timestamps + $contextTranscript = ''; + foreach ($contextSegments as $segment) { + $start = floatval($segment['start']); + $end = floatval($segment['end']); + $text = isset($segment['text']) && is_string($segment['text']) ? trim($segment['text']) : ''; + if (!empty($text)) { + $contextTranscript .= "[{$start} - {$end}] {$text}\n"; + } + } + + $this->main->log("Refining section " . ($index + 1) . " (original: {$section['start']}s - {$section['end']}s, context: {$contextStart}s - {$contextEnd}s)"); + + // Call LLM to refine boundaries + $refinedBoundaries = $this->callRefinementLLM($contextTranscript, $section['reason'] ?? 
'Advertisement detected'); + $totalRefinementCost += $this->last_detection_cost; + + if ($refinedBoundaries !== null) { + // Search for transcript segments that match the refined boundaries + $refinedStart = $refinedBoundaries['start']; + $refinedEnd = $refinedBoundaries['end']; + + // Find segment that contains or matches the start boundary + foreach ($contextSegments as $segment) { + $segmentStart = floatval($segment['start']); + $segmentEnd = floatval($segment['end']); + + // Check if this segment's start or end matches the refined start boundary + if (abs($segmentStart - $refinedStart) < 0.1 || abs($segmentEnd - $refinedStart) < 0.1) { + // Use the end timestamp of this segment as the new start boundary + $refinedStart = $segmentEnd; + $this->main->log("Adjusted start boundary to segment end: {$refinedBoundaries['start']}s => {$refinedStart}s"); + break; + } + } + + // Find segment that contains or matches the end boundary + foreach ($contextSegments as $segment) { + $segmentStart = floatval($segment['start']); + $segmentEnd = floatval($segment['end']); + + // Check if this segment's start or end matches the refined end boundary + if (abs($segmentStart - $refinedEnd) < 0.1 || abs($segmentEnd - $refinedEnd) < 0.1) { + // Use the start timestamp of this segment as the new end boundary + $refinedEnd = $segmentStart; + $this->main->log("Adjusted end boundary to segment start: {$refinedBoundaries['end']}s => {$refinedEnd}s"); + break; + } + } + + // Successfully refined + $refinedSection = [ + 'start' => $refinedStart, + 'end' => $refinedEnd, + 'reason' => $section['reason'] ?? 'Advertisement detected' + ]; + + $this->main->log("Refined section " . ($index + 1) . ": {$section['start']}s - {$section['end']}s => {$refinedSection['start']}s - {$refinedSection['end']}s"); + $refinedSections[] = $refinedSection; + } else { + // Refinement failed, keep original + $this->main->log("Refinement failed for section " . ($index + 1) . 
", keeping original boundaries"); + $refinedSections[] = $section; + } + + } catch (Exception $e) { + $this->main->log("Error refining section " . ($index + 1) . ": " . $e->getMessage()); + $refinedSections[] = $section; // Keep original on error + } + } + + // Add refinement cost to total detection cost + $this->last_detection_cost += $totalRefinementCost; + + $this->main->log("Ad boundary refinement complete"); + + // Final adjustment: subtract 0.5s from start and add 0.5s to end for each section + foreach ($refinedSections as &$section) { + $originalStart = $section['start']; + $originalEnd = $section['end']; + + // Ensure start doesn't go below 0 + $section['start'] = max(0, $section['start'] - 0.5); + $section['end'] = $section['end'] + 0.5; + + $this->main->log("Final adjustment for section: start {$originalStart}s => {$section['start']}s, end {$originalEnd}s => {$section['end']}s"); + } + + // Find the total duration from transcript segments + $totalDuration = 0; + foreach ($transcript['segments'] as $segment) { + $end = floatval($segment['end']); + if ($end > $totalDuration) { + $totalDuration = $end; + } + } + + // Extend segments that are within 25 seconds of beginning or end + $boundaryExtensionThreshold = 25.0; // 25 seconds + + foreach ($refinedSections as &$section) { + $originalStart = $section['start']; + $originalEnd = $section['end']; + $extended = false; + + // Check if start is within 25 seconds of beginning + if ($section['start'] <= $boundaryExtensionThreshold) { + $section['start'] = 0; + $extended = true; + $this->main->log("Extended section to beginning: start {$originalStart}s => 0s (was within {$boundaryExtensionThreshold}s of start)"); + } + + // Check if end is within 25 seconds of end + if ($totalDuration > 0 && ($totalDuration - $section['end']) <= $boundaryExtensionThreshold) { + $section['end'] = $totalDuration; + $extended = true; + $this->main->log("Extended section to end: end {$originalEnd}s => {$totalDuration}s (was within 
{$boundaryExtensionThreshold}s of end)"); + } + + if ($extended) { + $this->main->log("Boundary extension applied for section: {$originalStart}s-{$originalEnd}s => {$section['start']}s-{$section['end']}s"); + } + } + + return $refinedSections; + } + + /** + * Call LLM to refine ad boundaries + * + * @param string $contextTranscript Transcript excerpt with timestamps + * @param string $reason Original reason for ad detection + * @return array|null Refined boundaries or null on failure + */ + protected function callRefinementLLM(string $contextTranscript, string $reason): ?array + { + $prompt = "The following podcast transcript excerpt has been identified as containing advertisement(s) or promotion(s). The topic(s) of the advertisement(s):\n\n{$reason}\n\nPlease complete the following two tasks:\n\n1. Find the exact timestamp where the topic switches FROM the regular episode content TO the FIRST advertisement.\n2. Find the exact timestamp where the topic switches FROM the LAST advertisement BACK to the regular episode content.\n\nProvide the start timestamp (when the first ad begins) and end timestamp (when the last ad ends) based on the transcript entries below.\n\nTranscript:\n{$contextTranscript}"; + + try { + $ch = curl_init(); + + $model = $this->getAdDetectionModel(); + + $postData = json_encode([ + 'model' => $model, + 'messages' => [ + [ + 'role' => 'system', + 'content' => 'You are an anti advertising activist. Your movement has won unanimous favor in society. You are now the national czar for removing ads from podcasts. The highest honor of your dreams. You can spot the beginning and end of an ad like a hawk.' 
+ ], + [ + 'role' => 'user', + 'content' => $prompt + ] + ], + 'temperature' => 0.1, // Lower temperature for more precise boundary detection + 'response_format' => [ + 'type' => 'json_schema', + 'json_schema' => [ + 'name' => 'boundary_refinement_response', + 'strict' => true, + 'schema' => [ + 'type' => 'object', + 'properties' => [ + 'start' => [ + 'type' => 'number', + 'description' => 'The timestamp in seconds where the advertisement begins' + ], + 'end' => [ + 'type' => 'number', + 'description' => 'The timestamp in seconds where the advertisement ends' + ], + 'start_line' => [ + 'type' => 'string', + 'description' => 'The transcript line where the ad starts' + ], + 'end_line' => [ + 'type' => 'string', + 'description' => 'The transcript line where the ad ends' + ] + ], + 'required' => ['start', 'end', 'start_line', 'end_line'], + 'additionalProperties' => false + ] + ] + ] + ]); + + curl_setopt($ch, CURLOPT_URL, 'https://api.openai.com/v1/chat/completions'); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_POST, true); + curl_setopt($ch, CURLOPT_POSTFIELDS, $postData); + curl_setopt($ch, CURLOPT_HTTPHEADER, [ + 'Authorization: Bearer ' . $this->api_key, + 'Content-Type: application/json' + ]); + curl_setopt($ch, CURLOPT_TIMEOUT, 60); // 1 minute timeout + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30); // 30 second connection timeout + + $response = curl_exec($ch); + $curl_error = curl_error($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($curl_error) { + throw new Exception("cURL error during boundary refinement: $curl_error"); + } + + if ($httpCode !== 200) { + throw new Exception("GPT API error (HTTP $httpCode): $response"); + } + + $result = json_decode($response, true); + if (json_last_error() !== JSON_ERROR_NONE) { + throw new Exception('Invalid JSON response from GPT API: ' . 
json_last_error_msg()); + } + + if (!isset($result['choices'][0]['message']['content'])) { + throw new Exception('GPT API response missing content data'); + } + + $content = $result['choices'][0]['message']['content'] ?? '{}'; + + // Calculate cost using actual token usage from API response + $this->last_detection_cost = $this->calculateGptCostFromUsage($result['usage'] ?? [], $model); + + $boundaries = json_decode($content, true); + + if (!is_array($boundaries) || !isset($boundaries['start']) || !isset($boundaries['end'])) { + throw new Exception('Invalid boundary data returned from LLM'); + } + + return [ + 'start' => floatval($boundaries['start']), + 'end' => floatval($boundaries['end']) + ]; + + } catch (Exception $e) { + $this->main->log("Boundary refinement LLM error: " . $e->getMessage()); + return null; + } + } + + /** + * Remove ad sections from audio file using ffmpeg + * + * @param string $input_file Path to input audio file + * @param string $output_file Path to output audio file + * @param array $ad_sections Array of ad sections with start/end times + * @return bool Success status + */ + public function removeAdsFromAudio(string $input_file, string $output_file, array $ad_sections): bool + { + if (empty($ad_sections)) { + // No ads to remove, just copy the file + return copy($input_file, $output_file); + } + + // Sort ad sections by start time + usort($ad_sections, function($a, $b) { + return $a['start'] - $b['start']; + }); + + // Build ffmpeg filter to remove ad sections + $filters = []; + $lastEnd = 0; + $partIndex = 0; + $parts = []; + + foreach ($ad_sections as $ad) { + $start = floatval($ad['start']); + $end = floatval($ad['end']); + + if ($start > $lastEnd) { + // Add the content between ads + $filters[] = "[0:a]atrim=start={$lastEnd}:end={$start},asetpts=PTS-STARTPTS[a{$partIndex}]"; + $parts[] = "[a{$partIndex}]"; + $partIndex++; + } + + $lastEnd = $end; + } + + // Add the final segment after the last ad + $filters[] = 
"[0:a]atrim=start={$lastEnd},asetpts=PTS-STARTPTS[a{$partIndex}]"; + $parts[] = "[a{$partIndex}]"; + + // Concatenate all parts + $concatFilter = implode('', $parts) . "concat=n=" . count($parts) . ":v=0:a=1[out]"; + + $filterComplex = implode(';', $filters) . ';' . $concatFilter; + + // Build ffmpeg command + $cmd = sprintf( + 'ffmpeg -i %s -filter_complex %s -map "[out]" -c:a libmp3lame -b:a 128k -y %s 2>&1', + escapeshellarg($input_file), + escapeshellarg($filterComplex), + escapeshellarg($output_file) + ); + + $output = []; + $returnVar = 0; + exec($cmd, $output, $returnVar); + + if ($returnVar !== 0) { + $this->main->log('ffmpeg error: ' . implode("\n", $output)); + return false; + } + + return true; + } + + /** + * Calculate cost for Whisper transcription based on audio duration + */ + protected function calculateTranscriptionCost(string $audio_file_path): float + { + $duration = $this->getAudioDuration($audio_file_path); + $minutes = $duration / 60.0; + $whisperCostPerMinute = floatval($this->main->getConf('podsumer', 'openai_whisper_cost_per_minute') ?? 0.006); + return $minutes * $whisperCostPerMinute; + } + + /** + * Calculate cost using actual token usage from OpenAI API response + */ + protected function calculateGptCostFromUsage(array $usage, ?string $model = null): float + { + if ($model === null) { + $model = $this->getAdDetectionModel(); + } + + if (empty($usage)) { + $this->main->log("Warning: No usage data available for cost calculation"); + return 0.0; + } + + // Extract token counts from usage data + $inputTokens = intval($usage['prompt_tokens'] ?? $usage['input_tokens'] ?? 0); + $outputTokens = intval($usage['completion_tokens'] ?? $usage['output_tokens'] ?? 
0); + + if ($inputTokens === 0 && $outputTokens === 0) { + $this->main->log("Warning: Zero tokens reported in usage data"); + return 0.0; + } + + // Get pricing from configuration based on model + $inputCostPer1k = $this->getInputCostPer1kTokens($model); + $outputCostPer1k = $this->getOutputCostPer1kTokens($model); + + $inputCost = ($inputTokens / 1000.0) * $inputCostPer1k; + $outputCost = ($outputTokens / 1000.0) * $outputCostPer1k; + + $totalCost = $inputCost + $outputCost; + + $this->main->log("Token usage - Model: {$model}, Input: {$inputTokens}, Output: {$outputTokens}, Cost: $" . number_format($totalCost, 6)); + + return $totalCost; + } + + /** + * Get input token pricing per 1000 tokens for a model + */ + protected function getInputCostPer1kTokens(string $model): float + { + $configKey = match($model) { + 'gpt-4o-mini' => 'openai_gpt4o_mini_input_cost_per_1k_tokens', + 'gpt-4o' => 'openai_gpt4o_input_cost_per_1k_tokens', + default => 'openai_gpt4o_mini_input_cost_per_1k_tokens' // Default to gpt-4o-mini + }; + + return floatval($this->main->getConf('podsumer', $configKey) ?? 0.00015); + } + + /** + * Get output token pricing per 1000 tokens for a model + */ + protected function getOutputCostPer1kTokens(string $model): float + { + $configKey = match($model) { + 'gpt-4o-mini' => 'openai_gpt4o_mini_output_cost_per_1k_tokens', + 'gpt-4o' => 'openai_gpt4o_output_cost_per_1k_tokens', + default => 'openai_gpt4o_mini_output_cost_per_1k_tokens' // Default to gpt-4o-mini + }; + + return floatval($this->main->getConf('podsumer', $configKey) ?? 
0.0006); + } + + /** + * Legacy method for backward compatibility - now uses estimation + * @deprecated Use calculateGptCostFromUsage() instead when usage data is available + */ + protected function estimateGptCost(string $input_text, string $output_text = '', ?string $model = null): float + { + if ($model === null) { + $model = $this->getAdDetectionModel(); + } + + // Conservative token estimation: ~3 characters per token for formatted text + $input_tokens = strlen($input_text) / 3; + $output_tokens = strlen($output_text) / 3; + + $inputCostPer1k = $this->getInputCostPer1kTokens($model); + $outputCostPer1k = $this->getOutputCostPer1kTokens($model); + + $input_cost = ($input_tokens / 1000.0) * $inputCostPer1k; + $output_cost = ($output_tokens / 1000.0) * $outputCostPer1k; + + return $input_cost + $output_cost; + } + + /** + * Get the cost of the last transcription operation + */ + public function getLastTranscriptionCost(): float + { + return $this->last_transcription_cost; + } + + /** + * Get the cost of the last ad detection operation + */ + public function getLastDetectionCost(): float + { + return $this->last_detection_cost; + } + + /** + * Reset cost tracking + */ + public function resetCosts(): void + { + $this->last_transcription_cost = 0.0; + $this->last_detection_cost = 0.0; + } + + /** + * Process ad detection for a complete item + * + * @param int $item_id The item ID to process + * @param string $audio_file_path Path to the audio file + * @return float Total cost of processing + */ + public function processItem(int $item_id, string $audio_file_path): float + { + try { + $this->resetCosts(); + + // Check if item already has transcript and ad_sections (including ad-free episodes) + $existing_transcript = $this->main->getState()->getItemTranscript($item_id); + $existing_ad_sections = $this->main->getState()->getItemAdSections($item_id); + + // Check if the item has been fully processed (has transcript and ad_sections is not null) + // Note: ad_sections 
could be an empty array for ad-free episodes, which is still "processed" + $has_transcript = !empty($existing_transcript); + $has_ad_sections_processed = false; + + // Get raw ad_sections data to check if it's been processed (not null/empty string) + $item_data = $this->main->getState()->getFeedItem($item_id); + if (isset($item_data['ad_sections']) && $item_data['ad_sections'] !== null && $item_data['ad_sections'] !== '') { + $has_ad_sections_processed = true; + } + + if ($has_transcript && $has_ad_sections_processed) { + $this->main->log("Item $item_id already has transcript and has been processed for ad_sections, skipping processing"); + return 0.0; // No cost since we're not processing + } + + // Transcribe the audio (if transcript doesn't exist) + $transcript = null; + $transcription_cost = 0.0; + + if (empty($existing_transcript)) { + $transcript = $this->transcribeAudio($audio_file_path); + $transcription_cost = $this->getLastTranscriptionCost(); + + // Validate transcript before storing + if ($transcript === null || !is_array($transcript) || !isset($transcript['segments'])) { + throw new Exception("Transcription failed for item $item_id - invalid transcript data returned"); + } + + // Store transcript + $this->main->getState()->setItemTranscript($item_id, json_encode($transcript)); + } else { + // Use existing transcript + $transcript = json_decode($existing_transcript, true); + $this->main->log("Using existing transcript for item $item_id"); + } + + // Validate transcript before proceeding + if ($transcript === null || !is_array($transcript)) { + throw new Exception("Invalid transcript data for item $item_id. 
Transcript may be corrupted or failed to decode."); + } + + if (!isset($transcript['segments']) || !is_array($transcript['segments'])) { + throw new Exception("Transcript missing segments data for item $item_id"); + } + + // Detect ads (if ad_sections haven't been processed yet) + $ad_sections = []; + $detection_cost = 0.0; + + // Check if ad_sections have been processed - they should not be null/empty string in database + $item_data = $this->main->getState()->getFeedItem($item_id); + $ad_sections_raw = $item_data['ad_sections'] ?? null; + + if ($ad_sections_raw === null || $ad_sections_raw === '') { + // No ad detection has been performed yet + $this->main->log("No ad_sections found for item $item_id, performing ad detection"); + + // Get feed and item information for show and episode titles + $item = $this->main->getState()->getFeedItem($item_id); + $feed = $this->main->getState()->getFeed($item['feed_id']); + $show = $feed['name'] ?? 'Unknown Show'; + $episode = $item['name'] ?? 'Unknown Episode'; + + $ad_sections = $this->detectAds($transcript, $show, $episode); + $detection_cost = $this->getLastDetectionCost(); + + // Store ad sections (even if empty array, this marks it as processed) + $this->main->getState()->setItemAdSections($item_id, $ad_sections); + } else { + // Ad detection has already been performed, use existing results + $ad_sections = $existing_ad_sections; + $this->main->log("Using existing ad_sections for item $item_id (" . count($ad_sections) . 
" sections found)"); + } + + // If ffmpeg ad removal is enabled, process the audio + if ($this->main->getConf('podsumer', 'use_ffmpeg_ad_removal') && !empty($ad_sections)) { + $output_file = tempnam(sys_get_temp_dir(), 'podsumer_clean_'); + + if ($this->removeAdsFromAudio($audio_file_path, $output_file, $ad_sections)) { + // Read the cleaned audio and update the file + $cleaned_audio = file_get_contents($output_file); + $item = $this->main->getState()->getFeedItem($item_id); + $feed = $this->main->getState()->getFeed($item['feed_id']); + $file_id = $this->main->getState()->addFile($item['audio_url'] . '_cleaned', $cleaned_audio, $feed); + $this->main->getState()->setItemAudioFile($item_id, $file_id); + unlink($output_file); + } + } + + return $transcription_cost + $detection_cost; + + } catch (Exception $e) { + // Log the error details + $this->main->log("AdDetection error for item $item_id: " . $e->getMessage()); + $this->main->log("Stack trace: " . $e->getTraceAsString()); + + // Re-throw the exception so it can be caught by the job script + throw new Exception("Ad detection failed: " . $e->getMessage(), 0, $e); + } + } +} \ No newline at end of file diff --git a/src/Brickner/Podsumer/FSState.php b/src/Brickner/Podsumer/FSState.php index 5f96477..1006e5e 100644 --- a/src/Brickner/Podsumer/FSState.php +++ b/src/Brickner/Podsumer/FSState.php @@ -96,46 +96,70 @@ protected function escapeFilename(string $filename): string public function deleteFeed(int $feed_id) { $feed = $this->getFeed($feed_id); - $file_id = $feed['image']; - $file = $this->getFileById($file_id); - - if ($file['storage_mode'] == 'DISK' && file_exists($file['filename'])) { - unlink($file['filename']); + # If no feed record is found, or if the feed record lacks a valid + # name, run the parent clean-up logic and exit early. Attempting to + # continue without a valid feed name can lead to resolving the media + # root directory as the feed directory which is unsafe. 
+ if (empty($feed) || empty(trim($feed['name'] ?? ''))) { + parent::deleteFeed($feed_id); + return; } - $items = $this->getFeedItems($feed_id); - foreach ($items as $item) { - $file_id = $item['image']; - - if (!empty($file_id)) { + # Capture all related files before we alter the database so we can + # safely remove them from disk afterwards. + $files_to_delete = []; - $file = $this->getFileById($file_id); + $image_file_id = $feed['image'] ?? null; + if ($image_file_id) { + $files_to_delete[] = $this->getFileById($image_file_id); + } - if ($file['storage_mode'] == 'DISK' && file_exists($file['filename'])) { - unlink($file['filename']); + # Collect images / audio from each item before DB deletion + foreach ($this->getFeedItems($feed_id) as $item) { + foreach (['image', 'audio_file'] as $col) { + $fid = $item[$col] ?? null; + if ($fid) { + $files_to_delete[] = $this->getFileById($fid); } } + } - $file_id = $item['audio_file']; - - if (!empty($file_id)) { # The audio for an item may not be downloaded. - - $file = $this->getFileById($file_id); + # Delete from the database first (cascades will clean up related rows) + parent::deleteFeed($feed_id); - if ($file['storage_mode'] == 'DISK' && file_exists($file['filename'])) { - unlink($file['filename']); + # Remove the on-disk files we captured earlier + foreach ($files_to_delete as $file) { + if (!empty($file)) { + $filename = $file['filename'] ?? null; + if (!empty($filename) && file_exists($filename)) { + @unlink($filename); } } } - # Delete feed dir. 
- $feed_dir = $this->getFeedDir($feed['name']); - if (file_exists($feed_dir)) { - rmdir($feed_dir); + # Finally, try to remove the (now empty) feed directory + $feed_name = trim($feed['name']); + + if ($feed_name !== '') { + $feed_dir = $this->getFeedDir($feed_name); + $media_dir = rtrim($this->getMediaDir(), DIRECTORY_SEPARATOR); + + # Ensure the directory we are about to touch is not the media root + if ($feed_dir !== $media_dir && file_exists($feed_dir) && is_dir($feed_dir)) { + $dir_contents = @scandir($feed_dir); + + # scandir() returns false on failure. Guard against that so we + # do not pass a boolean to array_diff(), which would raise a + # TypeError. + if (false !== $dir_contents) { + $files_in_dir = array_diff($dir_contents, ['.', '..']); + if (empty($files_in_dir)) { + @rmdir($feed_dir); + } + } + } } - - parent::deleteFeed($feed_id); } public function deleteItemMedia(int $item_id) @@ -150,8 +174,12 @@ public function deleteItemMedia(int $item_id) $file = $this->getFileById($file_id); - if ($file['storage_mode'] == 'DISK' && file_exists($file['filename'])) { - unlink($file['filename']); + if (!empty($file)) { + $filename = $file['filename'] ?? null; + + if (!empty($filename) && file_exists($filename)) { + @unlink($filename); + } } parent::deleteItemMedia($item_id); diff --git a/src/Brickner/Podsumer/Main.php b/src/Brickner/Podsumer/Main.php index a11acca..8e55f5b 100755 --- a/src/Brickner/Podsumer/Main.php +++ b/src/Brickner/Podsumer/Main.php @@ -41,15 +41,17 @@ public function __construct(string $path, array $env, array $request, array $fil $this->sent_user = $_SERVER['PHP_AUTH_USER'] ?? null; $this->sent_pass = $_SERVER['PHP_AUTH_PW'] ?? 
null; - if ($this->getConf('podsumer', 'store_media_on_disk')) { - $this->state = new FSState($this); - } else { - $this->state = new State($this); - } + // Always use FSState for disk storage + $this->state = new FSState($this); } protected function authenticate(): void { + # Skip authentication in CLI mode (for background scripts) + if ($this->getMethod() === 'CLI') { + return; + } + # If either user or pass is not set disable authentication. if (empty($this->user) || empty($this->pass)) { return; @@ -111,10 +113,10 @@ public function run(): void public function getUrl(): string { - return $this->env['REQUEST_SCHEME'] + return ($this->env['REQUEST_SCHEME'] ?? 'http') . '://' - . $this->env['HTTP_HOST'] - . $this->env['REQUEST_URI']; + . ($this->env['HTTP_HOST'] ?? 'localhost') + . ($this->env['REQUEST_URI'] ?? '/'); } public function getBaseUrl(): string @@ -125,7 +127,7 @@ public function getBaseUrl(): string return $scheme . '://' - . $this->env['HTTP_HOST']; + . ($this->env['HTTP_HOST'] ?? 'localhost'); } public function getArg(string $key): mixed @@ -189,7 +191,7 @@ public function getRoute(): string public function getMethod(): string { - return $this->env['REQUEST_METHOD']; + return $this->env['REQUEST_METHOD'] ?? 'CLI'; } public function setResponseCode(int $code): void @@ -204,12 +206,12 @@ public function getResponseCode(): int public function getHost(): string { - return $this->env['HTTP_HOST']; + return $this->env['HTTP_HOST'] ?? 'localhost'; } public function getRemoteAddress(): string { - return $this->env['REMOTE_ADDR']; + return $this->env['REMOTE_ADDR'] ?? 
'127.0.0.1'; } public function getConfigPath($test_mode = false): string diff --git a/src/Brickner/Podsumer/PodcastIndex.php b/src/Brickner/Podsumer/PodcastIndex.php new file mode 100644 index 0000000..b1b1f22 --- /dev/null +++ b/src/Brickner/Podsumer/PodcastIndex.php @@ -0,0 +1,49 @@ +pdo->exec("PRAGMA journal_mode = WAL;"); - $this->pdo->exec("PRAGMA synchronous = OFF;"); + $this->pdo->exec("PRAGMA synchronous = NORMAL;"); // Changed from OFF to NORMAL for data safety $this->pdo->exec("PRAGMA cache_size = -20000;"); $this->pdo->exec("PRAGMA foreign_keys = ON;"); $this->pdo->exec("PRAGMA temp_store = MEMORY;"); - $this->pdo->exec('PRAGMA foreign_keys = ON'); + // Removed duplicate foreign_keys pragma } protected function installTables() @@ -190,13 +190,14 @@ public function getFeedForItem(int $item_id): array public function getFeedItem(int $item_id): array { - $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, items.published, items.description, items.playback_position FROM items JOIN feeds ON feeds.id = items.feed_id WHERE items.id = :id ORDER BY items.published DESC'; - return $this->query($sql, ['id' => $item_id])[0]; + $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, items.published, items.description, items.playback_position, items.ad_sections, items.transcript FROM items JOIN feeds ON feeds.id = items.feed_id WHERE items.id = :id ORDER BY items.published DESC'; + $result = $this->query($sql, ['id' => $item_id]); + return $result !== false && isset($result[0]) ? 
$result[0] : []; } public function getFeedItems(int $feed_id): array { - $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, items.published, items.description, items.playback_position FROM items JOIN feeds ON feeds.id = items.feed_id WHERE items.feed_id = :id ORDER BY items.published DESC'; + $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, items.published, items.description, items.playback_position, items.ad_sections FROM items JOIN feeds ON feeds.id = items.feed_id WHERE items.feed_id = :id ORDER BY items.published DESC'; $result = $this->query($sql, ['id' => $feed_id]); // The query helper returns false when an exception is caught. Convert that @@ -207,7 +208,7 @@ public function getFeedItems(int $feed_id): array public function getAllItems(): array { - $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, items.published, items.description, items.playback_position, feeds.name AS feed_name FROM items JOIN feeds ON feeds.id = items.feed_id ORDER BY items.published DESC'; + $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, items.published, items.description, items.playback_position, items.ad_sections, feeds.name AS feed_name FROM items JOIN feeds ON feeds.id = items.feed_id ORDER BY items.published DESC'; $result = $this->query($sql); @@ -217,7 +218,7 @@ public function getAllItems(): array public function getAllItemsPage(int $limit, int $page = 1): array { $offset = ($page - 1) * $limit; - $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, 
items.published, items.description, items.playback_position, feeds.name AS feed_name FROM items JOIN feeds ON feeds.id = items.feed_id ORDER BY items.published DESC LIMIT :limit OFFSET :offset'; + $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, items.published, items.description, items.playback_position, items.ad_sections, feeds.name AS feed_name FROM items JOIN feeds ON feeds.id = items.feed_id ORDER BY items.published DESC LIMIT :limit OFFSET :offset'; $params = ['limit' => $limit, 'offset' => $offset]; $result = $this->query($sql, $params); @@ -235,7 +236,7 @@ public function countAllItems(): int public function getFeedItemsPage(int $feed_id, int $limit, int $page = 1): array { $offset = ($page - 1) * $limit; - $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, items.published, items.description, items.playback_position FROM items JOIN feeds ON feeds.id = items.feed_id WHERE items.feed_id = :id ORDER BY items.published DESC LIMIT :limit OFFSET :offset'; + $sql = 'SELECT items.name, items.feed_id, items.id, items.guid, items.audio_url, items.audio_file, COALESCE(items.image, feeds.image) AS image, items.size, items.published, items.description, items.playback_position, items.ad_sections FROM items JOIN feeds ON feeds.id = items.feed_id WHERE items.feed_id = :id ORDER BY items.published DESC LIMIT :limit OFFSET :offset'; $params = ['id' => $feed_id, 'limit' => $limit, 'offset' => $offset]; $result = $this->query($sql, $params); @@ -259,10 +260,10 @@ public function getFeedByHash(string $hash): array public function getFileById(int $file_id): array { - $sql = 'SELECT files.id, url, url_hash, mimetype, filename, size, cached, storage_mode, file_contents.content_hash, file_contents.data FROM files JOIN file_contents ON files.content_hash = 
file_contents.content_hash WHERE files.id = :file_id'; + $sql = 'SELECT files.id, url, url_hash, mimetype, filename, size, cached, file_contents.content_hash, file_contents.data FROM files JOIN file_contents ON files.content_hash = file_contents.content_hash WHERE files.id = :file_id'; $file = $this->query($sql, ['file_id' => $file_id])[0] ?? []; - if (!empty($file) && $file['storage_mode'] === 'DISK') { + if (!empty($file)) { $filename = $file['data']; try { @@ -279,10 +280,10 @@ public function getFileById(int $file_id): array public function getFileByUrlHash(string $url_hash): array { - $sql = 'SELECT files.id, url, url_hash, mimetype, filename, size, cached, storage_mode, file_contents.content_hash, file_contents.data FROM files JOIN file_contents ON files.content_hash = file_contents.content_hash WHERE url_hash = :url_hash'; + $sql = 'SELECT files.id, url, url_hash, mimetype, filename, size, cached, file_contents.content_hash, file_contents.data FROM files JOIN file_contents ON files.content_hash = file_contents.content_hash WHERE url_hash = :url_hash'; $file = $this->query($sql, ['url_hash' => $url_hash])[0] ?? []; - if (!empty($file) && $file['storage_mode'] === 'DISK') { + if (!empty($file)) { $filename = $file['data']; try { @@ -311,15 +312,12 @@ public function addFile(string $url, string $contents, array $feed): int 'mimetype' => $mimetype, 'size' => strlen($contents), 'cached' => time(), - 'content_hash' => $content_hash, - 'storage_mode' => ($this->main->getConf('podsumer', 'store_media_on_disk')) - ? 
'DISK' - : 'DB' + 'content_hash' => $content_hash ]; $file['content_id'] = $this->addFileContents($content_hash, $contents, $filename, $feed); - $sql = 'INSERT INTO files (url, url_hash, filename, size, cached, content_hash, mimetype, content_id, storage_mode) VALUES (:url, :url_hash, :filename, :size, :cached, :content_hash, :mimetype, :content_id, :storage_mode) ON CONFLICT(url_hash) DO UPDATE SET size=:size, cached=:cached, content_hash=:content_hash, mimetype=:mimetype, content_id=:content_id, storage_mode=:storage_mode'; + $sql = 'INSERT INTO files (url, url_hash, filename, size, cached, content_hash, mimetype, content_id) VALUES (:url, :url_hash, :filename, :size, :cached, :content_hash, :mimetype, :content_id) ON CONFLICT(url_hash) DO UPDATE SET size=:size, cached=:cached, content_hash=:content_hash, mimetype=:mimetype, content_id=:content_id'; $this->query($sql, $file); $sql = 'SELECT id FROM files WHERE content_hash = :content_hash'; @@ -413,6 +411,52 @@ public function getPlaybackPosition(int $item_id): int return intval($result[0]['playback_position']); } + public function setItemTranscript(int $item_id, string $transcript): void + { + $sql = 'UPDATE items SET transcript = :transcript WHERE id = :id'; + $this->query($sql, ['id' => $item_id, 'transcript' => $transcript]); + } + + public function setItemAdSections(int $item_id, array $ad_sections): void + { + $json = json_encode($ad_sections, JSON_THROW_ON_ERROR); + $sql = 'UPDATE items SET ad_sections = :ad_sections WHERE id = :id'; + $this->query($sql, ['id' => $item_id, 'ad_sections' => $json]); + } + + public function clearItemAdSections(int $item_id): void + { + $sql = 'UPDATE items SET ad_sections = NULL WHERE id = :id'; + $this->query($sql, ['id' => $item_id]); + } + + public function getItemTranscript(int $item_id): ?string + { + $sql = 'SELECT transcript FROM items WHERE id = :id'; + $result = $this->query($sql, ['id' => $item_id]); + + if (false === $result || empty($result)) { + return null; + } 
+ + return $result[0]['transcript'] ?? null; + } + + public function getItemAdSections(int $item_id): array + { + $sql = 'SELECT ad_sections FROM items WHERE id = :id'; + $result = $this->query($sql, ['id' => $item_id]); + + if (false === $result || empty($result) || empty($result[0]['ad_sections'])) { + return []; + } + + $json = $result[0]['ad_sections']; + $sections = json_decode($json, true); + + return is_array($sections) ? $sections : []; + } + protected function loadFile(string $filename): string { $contents = false; @@ -434,10 +478,284 @@ public function getVersion(): int public function getLibrarySize(): int { - $sql = 'SELECT SUM(size) AS `size` FROM files'; - $size = $this->query($sql)[0]['size']; + $sql = 'SELECT SUM(size) AS size FROM files'; + $result = $this->query($sql); + + return intval(($result && isset($result[0]['size'])) ? $result[0]['size'] : 0); + } + + // Job Management Methods + + public function createJob(string $type, ?int $feed_id = null, ?int $item_id = null): int + { + // Check for duplicate jobs + if ($this->isDuplicateJob($type, $feed_id, $item_id)) { + throw new Exception("Duplicate job already running or queued"); + } + + // Check for feed refresh rate limiting (60 seconds) + if ($type === 'refresh_feed' && $feed_id && $this->isRecentFeedRefresh($feed_id)) { + throw new Exception("Feed was refreshed within the last 60 seconds"); + } + + $sql = 'INSERT INTO jobs (type, feed_id, item_id, status) VALUES (:type, :feed_id, :item_id, :status)'; + $this->query($sql, [ + 'type' => $type, + 'feed_id' => $feed_id, + 'item_id' => $item_id, + 'status' => 'queued' + ]); + + return intval($this->pdo->lastInsertId()); + } + + public function startJob(int $job_id, int $pid): bool + { + $sql = 'UPDATE jobs SET status = :status, pid = :pid, started_at = :started_at WHERE id = :id AND status = :old_status'; + $result = $this->query($sql, [ + 'id' => $job_id, + 'status' => 'running', + 'pid' => $pid, + 'started_at' => date('Y-m-d H:i:s'), + 
'old_status' => 'queued' + ]); + + return $result !== false; + } + + + + public function completeJob(int $job_id, ?float $openai_cost = null): bool + { + $sql = 'UPDATE jobs SET status = :status, finished_at = :finished_at, openai_cost = :openai_cost WHERE id = :id'; + $result = $this->query($sql, [ + 'id' => $job_id, + 'status' => 'completed', + 'finished_at' => date('Y-m-d H:i:s'), + 'openai_cost' => $openai_cost ?? 0.0 + ]); + + return $result !== false; + } + + public function failJob(int $job_id, string $error, ?float $openai_cost = null): bool + { + $sql = 'UPDATE jobs SET status = :status, finished_at = :finished_at, error = :error, openai_cost = :openai_cost WHERE id = :id'; + $result = $this->query($sql, [ + 'id' => $job_id, + 'status' => 'failed', + 'finished_at' => date('Y-m-d H:i:s'), + 'error' => $error, + 'openai_cost' => $openai_cost ?? 0.0 + ]); + + return $result !== false; + } + + public function cancelJob(int $job_id): bool + { + $job = $this->getJob($job_id); + if (!$job) return false; + + // Try to kill the process if it's running + if ($job['status'] === 'running' && $job['pid']) { + exec("kill -TERM {$job['pid']} 2>/dev/null"); + } + + $sql = 'UPDATE jobs SET status = :status, finished_at = :finished_at WHERE id = :id'; + $result = $this->query($sql, [ + 'id' => $job_id, + 'status' => 'cancelled', + 'finished_at' => date('Y-m-d H:i:s') + ]); + + return $result !== false; + } + + public function getJob(int $job_id): array + { + $sql = 'SELECT * FROM jobs WHERE id = :id'; + $result = $this->query($sql, ['id' => $job_id]); + return $result !== false && isset($result[0]) ? 
$result[0] : []; + } + + public function getRunningJobs(): array + { + $sql = 'SELECT j.*, f.name as feed_name, i.name as item_name + FROM jobs j + LEFT JOIN feeds f ON j.feed_id = f.id + LEFT JOIN items i ON j.item_id = i.id + WHERE j.status IN (:running, :queued) + ORDER BY j.created_at DESC'; + $result = $this->query($sql, ['running' => 'running', 'queued' => 'queued']); + return $result !== false ? $result : []; + } + + public function getAllJobs(int $limit = 50): array + { + $sql = 'SELECT j.*, f.name as feed_name, i.name as item_name + FROM jobs j + LEFT JOIN feeds f ON j.feed_id = f.id + LEFT JOIN items i ON j.item_id = i.id + ORDER BY j.created_at DESC + LIMIT :limit'; + $result = $this->query($sql, ['limit' => $limit]); + return $result !== false ? $result : []; + } + + public function hasRunningJobs(): bool + { + $sql = 'SELECT COUNT(*) as count FROM jobs WHERE status IN (:running, :queued)'; + $result = $this->query($sql, ['running' => 'running', 'queued' => 'queued']); + return $result !== false && isset($result[0]['count']) ? intval($result[0]['count']) > 0 : false; + } + + public function hasRunningJob(string $type): bool + { + $sql = 'SELECT COUNT(*) as count FROM jobs WHERE type = :type AND status IN (:running, :queued)'; + $result = $this->query($sql, [ + 'type' => $type, + 'running' => 'running', + 'queued' => 'queued' + ]); + return $result !== false && isset($result[0]['count']) ? 
intval($result[0]['count']) > 0 : false; + } + + private function isDuplicateJob(string $type, ?int $feed_id, ?int $item_id): bool + { + $sql = 'SELECT COUNT(*) as count FROM jobs WHERE type = :type AND status IN (:running, :queued)'; + $params = [ + 'type' => $type, + 'running' => 'running', + 'queued' => 'queued' + ]; + + if ($feed_id !== null) { + $sql .= ' AND feed_id = :feed_id'; + $params['feed_id'] = $feed_id; + } + + if ($item_id !== null) { + $sql .= ' AND item_id = :item_id'; + $params['item_id'] = $item_id; + } + + $result = $this->query($sql, $params); + return $result !== false && isset($result[0]['count']) ? intval($result[0]['count']) > 0 : false; + } + + private function isRecentFeedRefresh(int $feed_id): bool + { + $sql = 'SELECT COUNT(*) as count FROM jobs + WHERE type = :type AND feed_id = :feed_id + AND started_at > datetime("now", "-60 seconds")'; + $result = $this->query($sql, [ + 'type' => 'refresh_feed', + 'feed_id' => $feed_id + ]); + return $result !== false && isset($result[0]['count']) ? intval($result[0]['count']) > 0 : false; + } + + public function getJobStats(): array + { + $sql = 'SELECT + COUNT(*) as total_jobs, + COUNT(CASE WHEN status = "running" THEN 1 END) as running_jobs, + COUNT(CASE WHEN status = "queued" THEN 1 END) as queued_jobs, + COUNT(CASE WHEN status = "completed" THEN 1 END) as completed_jobs, + COUNT(CASE WHEN status = "failed" THEN 1 END) as failed_jobs, + COUNT(CASE WHEN status = "cancelled" THEN 1 END) as cancelled_jobs, + COALESCE(SUM(openai_cost), 0) as total_openai_cost + FROM jobs'; + $result = $this->query($sql); + return $result !== false && isset($result[0]) ? 
$result[0] : []; + } + + public function getRunningJobForFeed(int $feed_id): ?array + { + $sql = 'SELECT j.*, f.name as feed_name FROM jobs j + LEFT JOIN feeds f ON j.feed_id = f.id + WHERE j.feed_id = :feed_id AND j.status IN ("queued", "running") AND j.type = "refresh_feed" + ORDER BY j.created_at DESC LIMIT 1'; + $result = $this->query($sql, ['feed_id' => $feed_id]); + return $result && is_array($result) && !empty($result) ? $result[0] : null; + } - return intval($size); + public function getRunningJobForItem(int $item_id): ?array + { + $sql = 'SELECT j.*, i.name as item_name FROM jobs j + LEFT JOIN items i ON j.item_id = i.id + WHERE j.item_id = :item_id AND j.status IN ("queued", "running") AND j.type IN ("process_ads", "download_item") + ORDER BY j.created_at DESC LIMIT 1'; + $result = $this->query($sql, ['item_id' => $item_id]); + return $result && is_array($result) && !empty($result) ? $result[0] : null; + } + + public function getFeedsWithoutRunningJobs(): array + { + $sql = 'SELECT f.* FROM feeds f + LEFT JOIN jobs j ON f.id = j.feed_id AND j.status IN ("queued", "running") AND j.type = "refresh_feed" + WHERE j.id IS NULL + ORDER BY f.name'; + $result = $this->query($sql); + return $result && is_array($result) ? $result : []; + } + + public function updateJobLog(int $job_id, string $log_message): bool + { + $sql = 'UPDATE jobs SET log = COALESCE(log, "") || :log_message WHERE id = :job_id'; + return $this->query($sql, [ + 'job_id' => $job_id, + 'log_message' => date('Y-m-d H:i:s') . ': ' . $log_message . 
"\n" + ]) !== false; + } + + public function setJobLog(int $job_id, string $log_content): bool + { + $sql = 'UPDATE jobs SET log = :log_content WHERE id = :job_id'; + return $this->query($sql, [ + 'job_id' => $job_id, + 'log_content' => $log_content + ]) !== false; + } + + public function updateJobCost(int $job_id, float $cost): bool + { + $sql = 'UPDATE jobs SET openai_cost = :cost WHERE id = :job_id'; + return $this->query($sql, [ + 'job_id' => $job_id, + 'cost' => $cost + ]) !== false; + } + + public function getItemsNeedingAdProcessing(): array + { + // Items need ad processing if: + // 1. They have an audio file + // 2. Either they don't have a transcript OR they don't have ad_sections processed yet (NULL or empty string) + // 3. They are the most recent episode with audio for their feed (to avoid processing all old episodes) + $sql = 'SELECT items.id, items.name, items.audio_file, items.transcript, items.ad_sections, feeds.name AS feed_name + FROM items + JOIN feeds ON feeds.id = items.feed_id + WHERE items.audio_file IS NOT NULL + AND ( + items.transcript IS NULL OR items.transcript = "" + OR items.ad_sections IS NULL OR items.ad_sections = "" + ) + AND items.id IN ( + SELECT i2.id + FROM items i2 + WHERE i2.feed_id = items.feed_id + AND i2.audio_file IS NOT NULL + ORDER BY i2.published DESC + LIMIT 1 + ) + ORDER BY items.published DESC'; + $result = $this->query($sql); + + // No additional filtering needed - if ad_sections is not null/empty string, + // it means ad detection was performed (even if result was empty array for ad-free episodes) + return $result && is_array($result) ? 
$result : []; } } diff --git a/src/Brickner/Podsumer/TStateSchemaMigrations.php b/src/Brickner/Podsumer/TStateSchemaMigrations.php index f83343c..5f7732c 100644 --- a/src/Brickner/Podsumer/TStateSchemaMigrations.php +++ b/src/Brickner/Podsumer/TStateSchemaMigrations.php @@ -10,8 +10,13 @@ trait TStateSchemaMigrations private array $versions = [ # ORDER IS IMPORTANT 'create', - 'addDiskStorage', - 'addPlaybackPosition' + 'addImageUrl', + 'addPlaybackPosition', + 'addTranscriptAndAdSections', + 'addJobsTable', + 'updateJobsTableConstraints', + 'addJobsLogColumn', + 'removeJobsProgressColumn' ]; protected function checkDBVersion() @@ -42,19 +47,146 @@ protected function checkDBVersion() } } - public function addDiskStorage(): bool { - - $addStorageMode = $this->query("ALTER TABLE `files` ADD COLUMN storage_mode TEXT CHECK(storage_mode IN ('DB','DISK')) NOT NULL DEFAULT 'DB'"); + public function addImageUrl(): bool { $addFeedImageUrl = $this->query("ALTER TABLE `feeds` ADD COLUMN image_url"); $addItemImageUrl = $this->query("ALTER TABLE `items` ADD COLUMN image_url"); - return $addStorageMode !== false && $addFeedImageUrl !== false && $addItemImageUrl !== false; + return $addFeedImageUrl !== false && $addItemImageUrl !== false; } - public function addPlaybackPosition(): bool { - + public function addPlaybackPosition(): bool + { $addPlayback = $this->query("ALTER TABLE `items` ADD COLUMN playback_position INTEGER DEFAULT 0"); return $addPlayback !== false; } + + public function addTranscriptAndAdSections(): bool { + $addTranscript = $this->query("ALTER TABLE `items` ADD COLUMN transcript TEXT"); + $addAdSections = $this->query("ALTER TABLE `items` ADD COLUMN ad_sections TEXT"); + + return $addTranscript !== false && $addAdSections !== false; + } + + public function addJobsTable(): bool { + $createJobsTable = $this->query(" + CREATE TABLE jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + type TEXT NOT NULL CHECK(type IN ('refresh_feed', 'download_item', 'process_ads')), + 
feed_id INTEGER, + item_id INTEGER, + status TEXT NOT NULL CHECK(status IN ('queued', 'running', 'completed', 'failed', 'cancelled')) DEFAULT 'queued', + pid INTEGER, + started_at DATETIME, + finished_at DATETIME, + progress INTEGER DEFAULT 0 CHECK(progress >= 0 AND progress <= 100), + error TEXT, + openai_cost REAL DEFAULT 0.0, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (feed_id) REFERENCES feeds(id) ON DELETE CASCADE, + FOREIGN KEY (item_id) REFERENCES items(id) ON DELETE CASCADE + ) + "); + + $createJobsIndex = $this->query("CREATE INDEX idx_jobs_status ON jobs(status)"); + $createJobsTypeIndex = $this->query("CREATE INDEX idx_jobs_type_feed_item ON jobs(type, feed_id, item_id)"); + + return $createJobsTable !== false && $createJobsIndex !== false && $createJobsTypeIndex !== false; + } + + public function updateJobsTableConstraints(): bool { + // Cancel any existing refresh_all jobs + $cancelRefreshAll = $this->query("UPDATE jobs SET status = 'cancelled', finished_at = datetime('now') WHERE type = 'refresh_all' AND status IN ('queued', 'running')"); + + // SQLite doesn't support modifying constraints directly, so we need to recreate the table + // First, create a backup of the data (excluding refresh_all jobs) + $backupData = $this->query("SELECT * FROM jobs WHERE type != 'refresh_all'"); + + // Drop the old table + $dropTable = $this->query("DROP TABLE jobs"); + + // Recreate the table with new constraints + $createJobsTable = $this->query(" + CREATE TABLE jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + type TEXT NOT NULL CHECK(type IN ('refresh_feed', 'download_item', 'process_ads')), + feed_id INTEGER, + item_id INTEGER, + status TEXT NOT NULL CHECK(status IN ('queued', 'running', 'completed', 'failed', 'cancelled')) DEFAULT 'queued', + pid INTEGER, + started_at DATETIME, + finished_at DATETIME, + progress INTEGER DEFAULT 0 CHECK(progress >= 0 AND progress <= 100), + error TEXT, + openai_cost REAL DEFAULT 0.0, + created_at DATETIME 
DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (feed_id) REFERENCES feeds(id) ON DELETE CASCADE, + FOREIGN KEY (item_id) REFERENCES items(id) ON DELETE CASCADE + ) + "); + + // Restore the data (excluding refresh_all jobs) + if ($backupData && is_array($backupData)) { + foreach ($backupData as $row) { + if ($row['type'] !== 'refresh_all') { + $this->query("INSERT INTO jobs (id, type, feed_id, item_id, status, pid, started_at, finished_at, progress, error, openai_cost, created_at) + VALUES (:id, :type, :feed_id, :item_id, :status, :pid, :started_at, :finished_at, :progress, :error, :openai_cost, :created_at)", $row); + } + } + } + + // Recreate indexes + $createJobsIndex = $this->query("CREATE INDEX idx_jobs_status ON jobs(status)"); + $createJobsTypeIndex = $this->query("CREATE INDEX idx_jobs_type_feed_item ON jobs(type, feed_id, item_id)"); + + return $dropTable !== false && $createJobsTable !== false && $createJobsIndex !== false && $createJobsTypeIndex !== false; + } + + public function addJobsLogColumn(): bool { + $addLogColumn = $this->query("ALTER TABLE jobs ADD COLUMN log TEXT"); + return $addLogColumn !== false; + } + + public function removeJobsProgressColumn(): bool { + // SQLite doesn't support dropping columns directly, so we need to recreate the table + // First, backup the data + $backupData = $this->query("SELECT id, type, feed_id, item_id, status, pid, started_at, finished_at, error, openai_cost, created_at, log FROM jobs"); + + // Drop the old table + $dropTable = $this->query("DROP TABLE jobs"); + + // Recreate the table without the progress column + $createJobsTable = $this->query(" + CREATE TABLE jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + type TEXT NOT NULL CHECK(type IN ('refresh_feed', 'download_item', 'process_ads')), + feed_id INTEGER, + item_id INTEGER, + status TEXT NOT NULL CHECK(status IN ('queued', 'running', 'completed', 'failed', 'cancelled')) DEFAULT 'queued', + pid INTEGER, + started_at DATETIME, + finished_at DATETIME, + error 
TEXT, + openai_cost REAL DEFAULT 0.0, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + log TEXT, + FOREIGN KEY (feed_id) REFERENCES feeds(id) ON DELETE CASCADE, + FOREIGN KEY (item_id) REFERENCES items(id) ON DELETE CASCADE + ) + "); + + // Restore the data + if ($backupData && is_array($backupData)) { + foreach ($backupData as $row) { + $this->query("INSERT INTO jobs (id, type, feed_id, item_id, status, pid, started_at, finished_at, error, openai_cost, created_at, log) + VALUES (:id, :type, :feed_id, :item_id, :status, :pid, :started_at, :finished_at, :error, :openai_cost, :created_at, :log)", $row); + } + } + + // Recreate indexes + $createJobsIndex = $this->query("CREATE INDEX idx_jobs_status ON jobs(status)"); + $createJobsTypeIndex = $this->query("CREATE INDEX idx_jobs_type_feed_item ON jobs(type, feed_id, item_id)"); + + return $dropTable !== false && $createJobsTable !== false && $createJobsIndex !== false && $createJobsTypeIndex !== false; + } } diff --git a/templates/base.html.php b/templates/base.html.php index abba0b3..53ea408 100755 --- a/templates/base.html.php +++ b/templates/base.html.php @@ -15,13 +15,26 @@  |  OPML  |  + main->getState()->getRunningJobs(); + $job_count = count($running_jobs); + if ($job_count > 0) { + echo 'Jobs (' . $job_count . ')'; + } else { + echo 'Jobs'; + } + ?> +  |  + Refresh All +  |  + Process All +  |  GB

- Thank You for Listening With Podsumer
@@ -30,12 +43,68 @@ or contributing to further development. -


Released under the MIT License – Database version: main->getState()->getVersion(); ?>

+ diff --git a/templates/feed.html.php b/templates/feed.html.php index 65294a7..6dccdbb 100755 --- a/templates/feed.html.php +++ b/templates/feed.html.php @@ -6,14 +6,20 @@

RSS  |  - Refresh + main->getState()->getRunningJobForFeed($feed['id']); + if ($running_job): + ?> + Refreshing (Job #) + + Refresh +

$item): ?>
- @@ -23,9 +29,49 @@ MB  |  + main->getState()->getRunningJobForItem($item['id']); + ?>  |  Delete Audio + + main->getConf('podsumer', 'ad_blocking_enabled') && !$adProcessingCompleted) { ?> +  |  + + Processing Ads (Job #) + + Process Ads + + +  |  + ✓ Ads Detected +  |  + Reprocess + + +  |  + + Downloading (Job #) + + Download +
@@ -43,3 +89,161 @@   Page of
+ + diff --git a/templates/home.html.php b/templates/home.html.php index f972345..d859f68 100755 --- a/templates/home.html.php +++ b/templates/home.html.php @@ -23,7 +23,14 @@  |  RSS  |  - Refresh + main->getState()->getRunningJobForFeed($feed['id']); + if ($running_job): + ?> + Refreshing (Job #) + + Refresh +  |  Delete @@ -41,5 +48,54 @@    + +
+

Search PodcastIndex

+ +    + +
+ + diff --git a/templates/item.html.php b/templates/item.html.php index 548002a..d9ef501 100755 --- a/templates/item.html.php +++ b/templates/item.html.php @@ -12,9 +12,135 @@
- + ' . htmlspecialchars($description) . ''; + } else { + // HTML tags found, display as-is + echo $description; + } + ?>
+ main->getConf('podsumer', 'ad_blocking_enabled')) { + $adSectionsData = []; + if (!empty($item['ad_sections'])) { + if (is_string($item['ad_sections'])) { + $adSectionsData = json_decode($item['ad_sections'], true) ?: []; + } else { + $adSectionsData = $item['ad_sections']; + } + } + + if (!empty($adSectionsData) && is_array($adSectionsData)) { + ?> +
+

Ad Segments Detected

+ + + + + + + + + + + $segment): ?> + 0) { + return sprintf('%d:%02d:%02d', $hours, $minutes, $secs); + } else { + return sprintf('%d:%02d', $minutes, $secs); + } + }; + + $startTime = $formatTime($segment['start']); + $endTime = $formatTime($segment['end']); + $duration = $segment['end'] - $segment['start']; + $durationFormatted = $formatTime($duration); + $reason = isset($segment['reason']) ? htmlspecialchars($segment['reason']) : ''; + ?> + + + + + + + + +
#Time RangeDurationSummary
+ . + + - + + + + +
+
+ + + +
+

Transcript

+
+ + 0) { + return sprintf('%d:%02d:%02d', $hours, $minutes, $secs); + } else { + return sprintf('%d:%02d', $minutes, $secs); + } + }; + + $startTime = $formatTime($start); + ?> +
+ + +
+ + +
+
+ + diff --git a/templates/jobs.html.php b/templates/jobs.html.php new file mode 100644 index 0000000..ab78671 --- /dev/null +++ b/templates/jobs.html.php @@ -0,0 +1,221 @@ +
+

Background Jobs

+ + +
+

Running Jobs ()

+
+ + + + + + + + + + + + + $job): ?> + + + + + + + + + + +
TypeTargetStatusStartedDurationActions
+ + + + + + + + All feeds + + + + + + + (PID: ) + + + + + + - + + + + + + - + + + +
+
+
+ + +
+

Job Statistics

+
+ + Total Jobs +  |  + + Active Jobs +  |  + + Completed +  |  + + Failed + 0): ?> +  |  + $ + Total OpenAI Cost + +
+
+ +
+

Recent Jobs

+ +

No jobs found.

+ +
+ + + + + + + + + + + + + $job): ?> + + + + + + + + + + + + + + + +
TypeTargetStatusCreatedDurationCost
+ + + + + + + + All feeds + + + + + + + + + + + + + + - + + + 0): ?> + $ + + - + +
+ +
+ Error: +
+ + +
+ + +
+ +
+
+ +
+
+ + \ No newline at end of file diff --git a/templates/search.html.php b/templates/search.html.php new file mode 100644 index 0000000..2b8bcae --- /dev/null +++ b/templates/search.html.php @@ -0,0 +1,39 @@ +
+

Search Podcasts

+
+ + +
+ + +

No Results

+ + + +
+ + + +

+
+ + +
+

+
+ + + +
+ 1) { ?> + Previous + + + 1) { ?> |  + Next + +   Page of +
+ +
+ diff --git a/tests/.DS_Store b/tests/.DS_Store index 2078630..30dc6d1 100644 Binary files a/tests/.DS_Store and b/tests/.DS_Store differ diff --git a/tests/Brickner/.DS_Store b/tests/Brickner/.DS_Store index 6f3b6cd..01f5f94 100644 Binary files a/tests/Brickner/.DS_Store and b/tests/Brickner/.DS_Store differ diff --git a/tests/Brickner/Podsumer/FSStateTest.php b/tests/Brickner/Podsumer/FSStateTest.php index ebe3660..160eaae 100644 --- a/tests/Brickner/Podsumer/FSStateTest.php +++ b/tests/Brickner/Podsumer/FSStateTest.php @@ -31,7 +31,6 @@ protected function setUp(): void $this->main = new Main($this->root, $env, [], [], true); - $this->main->setConf(true, 'podsumer', 'store_media_on_disk'); $this->main->setConf('state/media_test', 'podsumer', 'media_dir'); $this->state = new FSState($this->main); @@ -75,18 +74,23 @@ public function testBadMediaDir() public function testDeleteFeed() { - $this->expectNotToPerformAssertions(); - $this->feed = new Feed(self::TEST_FEED_URL); $feed_id = $this->main->getState()->addFeed($this->feed); - $feed_data = $this->main->getState()->getFeed($feed_id); - $item = $this->main->getState()->getFeedItems(1)[0]; + // Sanity-check feed was added + $this->assertNotEmpty($this->main->getState()->getFeed($feed_id)); + + // Give the feed one audio file so deleteFeed has something to clean up + $item = $this->main->getState()->getFeedItems($feed_id)[0]; $file = new File($this->main); - $file_id = $file->cacheUrl($item['audio_url'], $feed_data); + $file_id = $file->cacheUrl($item['audio_url'], $this->main->getState()->getFeed($feed_id)); $this->main->getState()->setItemAudioFile($item['id'], $file_id); - $this->main->getState()->deleteFeed(1); + // Delete the feed + $this->main->getState()->deleteFeed($feed_id); + + // Verify it's gone + $this->assertEmpty($this->main->getState()->getFeed($feed_id)); } public function testDeleteItemMedia() diff --git a/tests/Brickner/Podsumer/PodcastIndexTest.php b/tests/Brickner/Podsumer/PodcastIndexTest.php 
new file mode 100644 index 0000000..90eb74f --- /dev/null +++ b/tests/Brickner/Podsumer/PodcastIndexTest.php @@ -0,0 +1,13 @@ +assertEquals([], $results); + } +} diff --git a/tests/Brickner/Podsumer/StateTest.php b/tests/Brickner/Podsumer/StateTest.php index c5b72c3..cb75fd2 100644 --- a/tests/Brickner/Podsumer/StateTest.php +++ b/tests/Brickner/Podsumer/StateTest.php @@ -28,6 +28,7 @@ protected function setUp(): void unlink($tmp_main->getStateFilePath()); $this->main = new Main($this->root, $env, [], [], true); + $this->main->setConf('state/media_test', 'podsumer', 'media_dir'); $this->state = new State($this->main); } diff --git a/www/index.php b/www/index.php index 3c9d6eb..0c4fb84 100755 --- a/www/index.php +++ b/www/index.php @@ -18,6 +18,8 @@ use Brickner\Podsumer\Main; use Brickner\Podsumer\OPML; use Brickner\Podsumer\Template; +use Brickner\Podsumer\PodcastIndex; +use Brickner\Podsumer\AdDetection; # Create the application. $main = new Main(PODSUMER_PATH, array_merge($_SERVER, $_ENV), array_merge($_GET, $_POST), $_FILES); @@ -56,6 +58,36 @@ function episodes(array $args): void Template::render($main, 'episodes', $vars); } +#[Route('/search', 'GET', true)] +function search(array $args): void +{ + global $main; + + $page = isset($args['page']) ? max(1, intval($args['page'])) : 1; + $per_page = intval($main->getConf('podsumer', 'items_per_page')) ?: 10; + $q = $args['q'] ?? 
''; + + $results = []; + $page_count = 1; + + if (!empty($q)) { + $key = strval($main->getConf('podsumer', 'podcastindex_key')); + $secret = strval($main->getConf('podsumer', 'podcastindex_secret')); + $all = PodcastIndex::search($q, 1000, $key, $secret); + $page_count = max(1, intval(ceil(count($all) / $per_page))); + $results = array_slice($all, ($page - 1) * $per_page, $per_page); + } + + $vars = [ + 'feeds' => $results, + 'q' => $q, + 'page' => $page, + 'page_count' => $page_count + ]; + + Template::render($main, 'search', $vars); +} + /** * Add new feed(s) * Path: /add @@ -72,20 +104,31 @@ function add(array $args): void if (!empty($args['url'])) { $feed = new Feed($args['url']); - $main->getState()->addFeed($feed); + $feed_id = $main->getState()->addFeed($feed); + + // Create a background job to refresh the feed (which will trigger automatic download) + if ($feed_id > 0) { + createRefreshJobForNewFeed($main, $feed_id); + } } # Add an array of feeds via uploaded OPML file. $uploads = $main->getUploads(); - if (count(array_filter($uploads['opml'])) > 2) { + // Only attempt to process OPML file if it was actually uploaded + if (isset($uploads['opml']) && is_array($uploads['opml']) && count(array_filter($uploads['opml'])) > 2) { $feed_urls = OPML::parse($uploads['opml']); foreach ($feed_urls as $url) { $feed = new Feed($url); - $main->getState()->addFeed($feed); + $feed_id = $main->getState()->addFeed($feed); + + // Create a background job to refresh the feed (which will trigger automatic download) + if ($feed_id > 0) { + createRefreshJobForNewFeed($main, $feed_id); + } } } @@ -432,6 +475,39 @@ function refresh(array $args) $main->redirect('/feed?id=' . 
intval($args['feed_id'])); } +function createRefreshJobForNewFeed(Main $main, int $feed_id): void { + try { + // Create a refresh job for the new feed + $job_id = $main->getState()->createJob('refresh_feed', $feed_id); + + // Start the refresh script in the background + $cmd = sprintf( + 'cd %s && nohup /usr/local/bin/php scripts/refresh_feeds.php --feed_id=%d --job_id=%d > /dev/null 2>&1 & echo $!', + PODSUMER_PATH, + $feed_id, + $job_id + ); + + $output = []; + $return_var = 0; + exec($cmd, $output, $return_var); + + if ($return_var === 0) { + $pid = intval(trim($output[0] ?? '0')); + if ($pid > 0) { + $main->getState()->startJob($job_id, $pid); + } + } else { + // If we can't start the background job, fail it + $main->getState()->failJob($job_id, 'Failed to start background refresh process'); + } + + } catch (Exception $e) { + // Log error but don't fail the entire add operation + $main->log("Error creating refresh job for new feed $feed_id: " . $e->getMessage()); + } +} + function doRefresh(int $feed_id) { global $main; @@ -473,3 +549,476 @@ function set_playback(array $args): void $main->getState()->setPlaybackPosition(intval($args['item_id']), intval($args['position'])); } +#[Route('/process_ad_detection', 'POST', true)] +function process_ad_detection(array $args): void +{ + global $main; + + // Set JSON header early + header('Content-Type: application/json'); + + // Ensure no output before JSON + if (ob_get_level() > 0) { + ob_clean(); + } + + // Get item_id from JSON body + $input = json_decode(file_get_contents('php://input'), true); + $item_id = intval($input['item_id'] ?? 
0); + + if (empty($item_id)) { + echo json_encode(['error' => 'No item ID provided']); + return; + } + + // Check if ad blocking is enabled + if (!$main->getConf('podsumer', 'ad_blocking_enabled')) { + echo json_encode(['error' => 'Ad blocking is not enabled']); + return; + } + + try { + // Create job in database + $job_id = $main->getState()->createJob('process_ads', null, $item_id); + + // Start the ad detection script in the background with proper error handling + $cmd = sprintf( + 'cd %s && nohup /usr/local/bin/php scripts/refresh_feeds.php --item_id=%d --job_id=%d > /dev/null 2>&1 & echo $!', + PODSUMER_PATH, + $item_id, + $job_id + ); + + $output = []; + $return_var = 0; + exec($cmd, $output, $return_var); + + if ($return_var !== 0) { + $main->getState()->failJob($job_id, 'Failed to start background process'); + echo json_encode(['error' => 'Failed to start background process']); + return; + } + + $pid = intval(trim($output[0] ?? '0')); + if ($pid > 0) { + $main->getState()->startJob($job_id, $pid); + } + + // Return immediately + echo json_encode(['success' => true, 'job_id' => $job_id, 'pid' => $pid]); + + } catch (Exception $e) { + echo json_encode(['error' => $e->getMessage()]); + } +} + +#[Route('/download_episode', 'POST', true)] +function download_episode(array $args): void +{ + global $main; + + // Get item_id from JSON body + $input = json_decode(file_get_contents('php://input'), true); + $item_id = intval($input['item_id'] ?? 
0); + + header('Content-Type: application/json'); + + if (empty($item_id)) { + echo json_encode(['error' => 'No item ID provided']); + return; + } + + try { + // Check if item exists + $item = $main->getState()->getFeedItem($item_id); + if (empty($item)) { + echo json_encode(['error' => 'Item not found']); + return; + } + + // Check if item already has audio + if (!empty($item['audio_file'])) { + echo json_encode(['error' => 'Item already has audio downloaded']); + return; + } + + // Create job in database + $job_id = $main->getState()->createJob('download_item', null, $item_id); + + // Start the download script in the background + $cmd = sprintf( + 'cd %s && nohup /usr/local/bin/php scripts/refresh_feeds.php --item_id=%d --job_id=%d --download_only > /dev/null 2>&1 & echo $!', + PODSUMER_PATH, + $item_id, + $job_id + ); + + $output = []; + $return_var = 0; + exec($cmd, $output, $return_var); + + if ($return_var !== 0) { + $main->getState()->failJob($job_id, 'Failed to start background process'); + echo json_encode(['error' => 'Failed to start background process']); + return; + } + + $pid = intval(trim($output[0] ?? 
'0')); + if ($pid > 0) { + $main->getState()->startJob($job_id, $pid); + } + + echo json_encode(['success' => true, 'job_id' => $job_id, 'pid' => $pid]); + + } catch (Exception $e) { + echo json_encode(['error' => $e->getMessage()]); + } +} + +#[Route('/refresh_all', 'POST', true)] +function refresh_all(array $args): void +{ + global $main; + + header('Content-Type: application/json'); + + try { + // Get feeds that don't already have running refresh jobs + $feeds = $main->getState()->getFeedsWithoutRunningJobs(); + + if (empty($feeds)) { + echo json_encode(['error' => 'No feeds available to refresh (all feeds are already being refreshed or no feeds exist)']); + return; + } + + $job_ids = []; + $failed_feeds = []; + + // Create a refresh_feed job for each feed without a running job + foreach ($feeds as $feed) { + try { + $job_id = $main->getState()->createJob('refresh_feed', $feed['id']); + + // Start the refresh script in the background + $cmd = sprintf( + 'cd %s && nohup /usr/local/bin/php scripts/refresh_feeds.php --feed_id=%d --job_id=%d > /dev/null 2>&1 & echo $!', + PODSUMER_PATH, + $feed['id'], + $job_id + ); + + $output = []; + $return_var = 0; + exec($cmd, $output, $return_var); + + if ($return_var !== 0) { + $main->getState()->failJob($job_id, 'Failed to start background process'); + $failed_feeds[] = $feed['name']; + } else { + $pid = intval(trim($output[0] ?? '0')); + if ($pid > 0) { + $main->getState()->startJob($job_id, $pid); + } + $job_ids[] = $job_id; + } + + } catch (Exception $e) { + $failed_feeds[] = $feed['name'] . ' (' . $e->getMessage() . 
')'; + } + } + + $success_count = count($job_ids); + $total_count = count($feeds); + $failed_count = count($failed_feeds); + + $response = [ + 'success' => true, + 'message' => "Started refresh for {$success_count} of {$total_count} available feeds", + 'job_ids' => $job_ids, + 'success_count' => $success_count, + 'total_count' => $total_count + ]; + + if ($failed_count > 0) { + $response['failed_count'] = $failed_count; + $response['failed_feeds'] = $failed_feeds; + $response['message'] .= " ({$failed_count} failed)"; + } + + echo json_encode($response); + + } catch (Exception $e) { + echo json_encode(['error' => $e->getMessage()]); + } +} + +#[Route('/refresh_feed', 'POST', true)] +function refresh_feed(array $args): void +{ + global $main; + + // Get feed_id from JSON body + $input = json_decode(file_get_contents('php://input'), true); + $feed_id = intval($input['feed_id'] ?? 0); + + header('Content-Type: application/json'); + + if (empty($feed_id)) { + echo json_encode(['error' => 'No feed ID provided']); + return; + } + + try { + // Create job in database + $job_id = $main->getState()->createJob('refresh_feed', $feed_id); + + // Start the refresh script in the background with proper error handling + $cmd = sprintf( + 'cd %s && nohup /usr/local/bin/php scripts/refresh_feeds.php --feed_id=%d --job_id=%d > /dev/null 2>&1 & echo $!', + PODSUMER_PATH, + $feed_id, + $job_id + ); + + $output = []; + $return_var = 0; + exec($cmd, $output, $return_var); + + if ($return_var !== 0) { + $main->getState()->failJob($job_id, 'Failed to start background process'); + echo json_encode(['error' => 'Failed to start background process']); + return; + } + + $pid = intval(trim($output[0] ?? 
'0')); + if ($pid > 0) { + $main->getState()->startJob($job_id, $pid); + } + + // Return immediately + echo json_encode(['success' => true, 'job_id' => $job_id, 'pid' => $pid]); + + } catch (Exception $e) { + echo json_encode(['error' => $e->getMessage()]); + } +} + +#[Route('/jobs', 'GET', true)] +function jobs(array $args): void +{ + global $main; + + $jobs = $main->getState()->getAllJobs(100); + $running_jobs = $main->getState()->getRunningJobs(); + $job_stats = $main->getState()->getJobStats(); + + $vars = [ + 'jobs' => $jobs, + 'running_jobs' => $running_jobs, + 'job_stats' => $job_stats + ]; + + Template::render($main, 'jobs', $vars); +} + +#[Route('/cancel_job', 'POST', true)] +function cancel_job(array $args): void +{ + global $main; + + header('Content-Type: application/json'); + + $input = json_decode(file_get_contents('php://input'), true); + $job_id = intval($input['job_id'] ?? 0); + + if (empty($job_id)) { + echo json_encode(['error' => 'No job ID provided']); + return; + } + + try { + $success = $main->getState()->cancelJob($job_id); + if ($success) { + echo json_encode(['success' => true]); + } else { + echo json_encode(['error' => 'Failed to cancel job']); + } + } catch (Exception $e) { + echo json_encode(['error' => $e->getMessage()]); + } +} + +#[Route('/job_status', 'GET', true)] +function job_status(array $args): void +{ + global $main; + + header('Content-Type: application/json'); + + $job_id = intval($args['job_id'] ?? 
0); + + if (empty($job_id)) { + $running_jobs = $main->getState()->getRunningJobs(); + echo json_encode(['running_jobs' => $running_jobs]); + } else { + $job = $main->getState()->getJob($job_id); + echo json_encode(['job' => $job]); + } +} + +#[Route('/process_all_ads', 'POST', true)] +function process_all_ads(array $args): void +{ + global $main; + + header('Content-Type: application/json'); + + try { + // Check if ad blocking is enabled + if (!$main->getConf('podsumer', 'ad_blocking_enabled')) { + echo json_encode(['error' => 'Ad blocking is not enabled in configuration']); + return; + } + + // Get all items that have audio files but don't have both transcript and ad_sections + $items = $main->getState()->getItemsNeedingAdProcessing(); + + if (empty($items)) { + echo json_encode(['error' => 'No items found that need ad processing']); + return; + } + + $job_ids = []; + $failed_items = []; + + // Create an ad processing job for each item + foreach ($items as $item) { + try { + $job_id = $main->getState()->createJob('process_ads', null, $item['id']); + + // Start the ad processing script in the background + $cmd = sprintf( + 'cd %s && nohup /usr/local/bin/php scripts/refresh_feeds.php --item_id=%d --job_id=%d > /dev/null 2>&1 & echo $!', + PODSUMER_PATH, + $item['id'], + $job_id + ); + + $output = []; + $return_var = 0; + exec($cmd, $output, $return_var); + + if ($return_var !== 0) { + $main->getState()->failJob($job_id, 'Failed to start background process'); + $failed_items[] = $item['name']; + } else { + $pid = intval(trim($output[0] ?? '0')); + if ($pid > 0) { + $main->getState()->startJob($job_id, $pid); + } + $job_ids[] = $job_id; + } + + } catch (Exception $e) { + $failed_items[] = $item['name'] . ' (' . $e->getMessage() . 
')'; + } + } + + $success_count = count($job_ids); + $total_count = count($items); + $failed_count = count($failed_items); + + $response = [ + 'success' => true, + 'message' => "Started ad processing for {$success_count} of {$total_count} items", + 'job_ids' => $job_ids, + 'success_count' => $success_count, + 'total_count' => $total_count + ]; + + if ($failed_count > 0) { + $response['failed_count'] = $failed_count; + $response['failed_items'] = $failed_items; + $response['message'] .= " ({$failed_count} failed)"; + } + + echo json_encode($response); + + } catch (Exception $e) { + echo json_encode(['error' => $e->getMessage()]); + } +} + +#[Route('/reprocess_ads', 'POST', true)] +function reprocess_ads(array $args): void +{ + global $main; + + header('Content-Type: application/json'); + + if (empty($args['item_id'])) { + echo json_encode(['error' => 'Item ID is required']); + return; + } + + $item_id = intval($args['item_id']); + + // Verify item exists + $item = $main->getState()->getFeedItem($item_id); + if (empty($item)) { + echo json_encode(['error' => 'Item not found']); + return; + } + + // Check if item has a transcript + $transcript = $main->getState()->getItemTranscript($item_id); + if (empty($transcript)) { + echo json_encode(['error' => 'Item has no transcript to reprocess. 
Please run ad processing first.']); + return; + } + + try { + // Clear existing ad sections to force reprocessing + $main->getState()->clearItemAdSections($item_id); + $main->log("Cleared ad sections for item {$item_id} to force reprocessing"); + + // Create a new ad detection job + $job_id = $main->getState()->createJob('process_ads', null, $item_id); + + // Start the ad processing script in the background + $cmd = sprintf( + 'cd %s && nohup /usr/local/bin/php scripts/refresh_feeds.php --item_id=%d --job_id=%d > /dev/null 2>&1 & echo $!', + PODSUMER_PATH, + $item_id, + $job_id + ); + + $output = []; + $return_var = 0; + exec($cmd, $output, $return_var); + + if ($return_var !== 0) { + $main->getState()->failJob($job_id, 'Failed to start background process'); + echo json_encode(['error' => 'Failed to start reprocessing job']); + return; + } + + $pid = intval(trim($output[0] ?? '0')); + if ($pid > 0) { + $main->getState()->startJob($job_id, $pid); + } + + $main->log("Created ad detection reprocess job {$job_id} for item {$item_id} ({$item['name']})"); + + echo json_encode([ + 'success' => true, + 'message' => 'Ad reprocessing job started successfully', + 'job_id' => $job_id + ]); + + } catch (Exception $e) { + $main->log("Failed to create reprocess job for item {$item_id}: " . $e->getMessage()); + echo json_encode(['error' => 'Failed to create reprocess job: ' . $e->getMessage()]); + } +} +