diff --git a/README.md b/README.md
index bb196fe5..7e80ba79 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,56 @@ but allows developers to run Python code on their native system.
 4. `npm run dev`
 6. When finished, run `docker compose stop`
 
+## Importing Recordings
+
+The `importRecordings` management command allows you to bulk import WAV files from a
+directory. It will:
+
+- Recursively search for all `.wav` and `.WAV` files in the specified directory
+- Extract GUANO metadata from each file (with filename fallback if metadata is missing)
+- Create Recording objects with the extracted metadata
+- Generate spectrograms synchronously for each recording
+- Log progress to the terminal
+
+### Usage
+
+**Basic usage with Docker Compose (with bind mount):**
+
+```bash
+docker compose run --rm -v /path/to/wav/files:/data django ./manage.py importRecordings /data
+```
+
+**With options:**
+
+```bash
+docker compose run --rm -v /path/to/wav/files:/data django ./manage.py importRecordings /data \
+    --owner username \
+    --public \
+    --limit 10
+```
+
+**Options:**
+
+- `directory` (required): Path to directory containing WAV files
+- `--owner USERNAME`: Username of the owner for the recordings (defaults to first superuser)
+- `--public`: Make imported recordings public
+- `--limit N`: Limit the number of WAV files to import (useful for testing)
+
+**Example with bind mount:**
+
+```bash
+docker compose run --rm \
+    -v /media/bryon.lewis/Elements/BATSAI/training_files:/data \
+    django ./manage.py importRecordings /data --limit 5
+```
+
+This will:
+
+1. Mount your host directory `/media/bryon.lewis/Elements/BATSAI/training_files` to `/data` in the container
+2. Import only the first 5 WAV files found
+3. Use the first superuser as the owner
+4. Create private recordings (unless `--public` is specified)
+
 ## Testing
 
 ### Initial Setup for Testing
@@ -91,9 +141,9 @@ Individual test environments may be selectively run.
 This also allows additional options to be added. Useful sub-commands include:
 
-* `uv run tox -e lint`: Run only the style checks
-* `uv run tox -e type`: Run only the type checks
-* `uv run tox -e test`: Run only the pytest-driven tests
+- `uv run tox -e lint`: Run only the style checks
+- `uv run tox -e type`: Run only the type checks
+- `uv run tox -e test`: Run only the pytest-driven tests
 
 To automatically reformat all code to comply with some (but not all) of the style checks, run `uv run tox -e format`.
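The README above mentions a filename fallback when GUANO metadata is missing. As a minimal sketch of what that fallback expects, based on the `^(\d+)_(.+)_(\d{8})_(\d{6})(?:_(.*))?$` pattern in `extract_metadata_from_filename` (introduced later in this diff), with a hypothetical example filename:

```python
# Illustrative sketch only; the filename below is a made-up example matching the
# <GRTS cell id>_<label>_<YYYYMMDD>_<HHMMSS>[_<suffix>].wav pattern parsed by
# bats_ai.core.utils.guano_utils.extract_metadata_from_filename.
from bats_ai.core.utils.guano_utils import extract_metadata_from_filename

meta = extract_metadata_from_filename('123456_SW_20230715_213045.wav')
assert meta['nabat_grid_cell_grts_id'] == '123456'
assert str(meta['nabat_activation_start_time']) == '2023-07-15 21:30:45'
assert meta['quadrant'] == 'SW'  # only set when the label is one of NE/NW/SE/SW
```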
diff --git a/bats_ai/core/management/commands/importRecordings.py b/bats_ai/core/management/commands/importRecordings.py new file mode 100644 index 00000000..4f5558b9 --- /dev/null +++ b/bats_ai/core/management/commands/importRecordings.py @@ -0,0 +1,206 @@ +import logging +from pathlib import Path + +from django.contrib.auth.models import User +from django.contrib.gis.geos import Point +from django.core.files import File +from django.core.management.base import BaseCommand +from django.utils import timezone + +from bats_ai.core.models import Recording +from bats_ai.core.utils.guano_utils import extract_guano_metadata +from bats_ai.tasks.tasks import recording_compute_spectrogram + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = 'Import WAV files from a directory, extract GUANO metadata, and create recordings' + + def add_arguments(self, parser): + parser.add_argument( + 'directory', + type=str, + help='Directory path containing WAV files to import', + ) + parser.add_argument( + '--owner', + type=str, + help='Username of the owner for the recordings (defaults to first superuser)', + ) + parser.add_argument( + '--public', + action='store_true', + help='Make imported recordings public', + ) + parser.add_argument( + '--limit', + type=int, + help='Limit the number of WAV files to import (useful for testing)', + ) + + def handle(self, *args, **options): + import matplotlib + + matplotlib.use('Agg') + + directory_path = Path(options['directory']) + owner_username = options.get('owner') + is_public = options.get('public', False) + limit = options.get('limit') + + # Validate directory + if not directory_path.exists(): + self.stdout.write(self.style.ERROR(f'Directory does not exist: {directory_path}')) + return + + if not directory_path.is_dir(): + self.stdout.write(self.style.ERROR(f'Path is not a directory: {directory_path}')) + return + + # Get or find owner + if owner_username: + try: + owner = User.objects.get(username=owner_username) + except User.DoesNotExist: + self.stdout.write(self.style.ERROR(f'User not found: {owner_username}')) + return + else: + # Default to first superuser + owner = User.objects.filter(is_superuser=True).first() + if not owner: + self.stdout.write( + self.style.ERROR( + 'No superuser found. Please specify --owner or create a superuser.' 
+ ) + ) + return + self.stdout.write(self.style.WARNING(f'Using default owner: {owner.username}')) + + # Find all WAV files + wav_files = list(directory_path.rglob('*.wav', case_sensitive=False)) + + if not wav_files: + self.stdout.write( + self.style.WARNING(f'No WAV files found in directory: {directory_path}') + ) + return + + # Apply limit if specified + total_files = len(wav_files) + if limit and limit > 0: + wav_files = wav_files[:limit] + self.stdout.write( + self.style.SUCCESS( + f'Found {total_files} WAV file(s), importing first {len(wav_files)}' + ) + ) + else: + self.stdout.write(self.style.SUCCESS(f'Found {len(wav_files)} WAV file(s) to import')) + + # Process each file + successful = 0 + failed = 0 + + for idx, wav_file in enumerate(wav_files, 1): + self.stdout.write(f'\n[{idx}/{len(wav_files)}] Processing: {wav_file.name}') + + try: + # Extract GUANO metadata + self.stdout.write(' Extracting GUANO metadata...') + metadata = extract_guano_metadata(wav_file, check_filename=True) + + # Extract date and time from metadata or file modification time + recorded_date = None + recorded_time = None + + if metadata.get('nabat_activation_start_time'): + dt = metadata['nabat_activation_start_time'] + recorded_date = dt.date() + recorded_time = dt.time() + else: + # Use file modification time as fallback + mtime = timezone.datetime.fromtimestamp( + wav_file.stat().st_mtime, tz=timezone.get_current_timezone() + ) + recorded_date = mtime.date() + recorded_time = mtime.time() + self.stdout.write( + self.style.WARNING( + ' No activation start time in metadata, using file modification time' + ) + ) + + # Create Point from latitude/longitude if available + point = None + if metadata.get('nabat_latitude') and metadata.get('nabat_longitude'): + point = Point(metadata['nabat_longitude'], metadata['nabat_latitude']) + + # Get grid cell ID + grts_cell_id = None + if metadata.get('nabat_grid_cell_grts_id'): + try: + grts_cell_id = int(metadata['nabat_grid_cell_grts_id']) + except (ValueError, TypeError): + pass + + # Convert species list to string if present + species_list_str = None + if metadata.get('nabat_species_list'): + species_list_str = ','.join(metadata['nabat_species_list']) + + # Create recording + self.stdout.write(' Creating recording...') + with open(wav_file, 'rb') as f: + recording = Recording( + name=wav_file.name, + owner=owner, + audio_file=File(f, name=wav_file.name), + recorded_date=recorded_date, + recorded_time=recorded_time, + equipment=None, # Not in GUANO metadata + grts_cell_id=grts_cell_id, + recording_location=point, + public=is_public, + comments=metadata.get('nabat_comments'), + detector=metadata.get('nabat_detector_type'), + software=metadata.get('nabat_software_type'), + site_name=metadata.get('nabat_site_name'), + species_list=species_list_str, + unusual_occurrences=metadata.get('nabat_unusual_occurrences'), + ) + recording.save() + + self.stdout.write(self.style.SUCCESS(f' Created recording ID: {recording.pk}')) + + # Generate spectrogram synchronously + self.stdout.write(' Generating spectrogram...') + try: + result = recording_compute_spectrogram(recording.pk) + self.stdout.write( + self.style.SUCCESS( + f' Spectrogram generated (ID: {result.get("spectrogram_id")})' + ) + ) + except Exception as e: + self.stdout.write( + self.style.ERROR(f' Failed to generate spectrogram: {str(e)}') + ) + logger.exception('Error generating spectrogram', exc_info=e) + raise e + + successful += 1 + self.stdout.write(self.style.SUCCESS(f' ✓ Successfully imported: {wav_file.name}')) + + 
except Exception as e: + failed += 1 + self.stdout.write( + self.style.ERROR(f' ✗ Failed to import {wav_file.name}: {str(e)}') + ) + logger.exception('Error importing file', exc_info=e) + + # Summary + self.stdout.write('\n' + '=' * 60) + self.stdout.write( + self.style.SUCCESS(f'Import complete: {successful} successful, {failed} failed') + ) diff --git a/bats_ai/core/utils/__init__.py b/bats_ai/core/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bats_ai/core/utils/guano_utils.py b/bats_ai/core/utils/guano_utils.py new file mode 100644 index 00000000..a727593d --- /dev/null +++ b/bats_ai/core/utils/guano_utils.py @@ -0,0 +1,189 @@ +from datetime import datetime +from pathlib import Path +import re + +from guano import GuanoFile + + +def parse_datetime(datetime_str: str) -> datetime | None: + """Parse datetime string from GUANO metadata. + + Tries multiple formats: + 1. Custom format: '%Y%m%dT%H%M%S' + 2. ISO format + + Args: + datetime_str: String representation of datetime + + Returns: + datetime object or None if parsing fails + """ + if datetime_str: + try: + # Try parsing using the custom format + return datetime.strptime(datetime_str, '%Y%m%dT%H%M%S') + except ValueError: + try: + # Try parsing using ISO format + return datetime.fromisoformat(datetime_str) + except ValueError: + # If both formats fail, return None + return None + return None + + +def extract_metadata_from_filename(filename: str) -> dict: + # Remove file extension if present + filename_without_ext = Path(filename).stem + + regex_pattern = re.compile(r'^(\d+)_(.+)_(\d{8})_(\d{6})(?:_(.*))?$') + match = regex_pattern.match(filename_without_ext) + + if not match: + return {} + + # Extract matched groups + cell_id = match.group(1) + label_name = match.group(2) + date_str = match.group(3) + timestamp_str = match.group(4) + + metadata = {} + + # Extract grid cell ID + if cell_id: + try: + metadata['nabat_grid_cell_grts_id'] = str(int(cell_id)) + except ValueError: + pass + + # Extract date and time + if date_str and len(date_str) == 8 and timestamp_str and len(timestamp_str) == 6: + try: + # Convert YYYYMMDD to date components + year = int(date_str[0:4]) + month = int(date_str[4:6]) + day = int(date_str[6:8]) + + # Convert HHMMSS to time components + hour = int(timestamp_str[0:2]) + minute = int(timestamp_str[2:4]) + second = int(timestamp_str[4:6]) + + # Create datetime object + activation_time = datetime(year, month, day, hour, minute, second) + metadata['nabat_activation_start_time'] = activation_time + except (ValueError, IndexError): + pass + + # Extract quadrant if labelName is a valid quadrant + if label_name and label_name.upper() in ['SW', 'NE', 'NW', 'SE']: + metadata['quadrant'] = label_name.upper() + + return metadata + + +def extract_guano_metadata(file_path: str | Path, check_filename: bool = False) -> dict: + """Extract GUANO metadata from a WAV file. 
+ + Args: + file_path: Path to the WAV file with GUANO metadata + + Returns: + Dictionary containing extracted NABat metadata fields: + - nabat_grid_cell_grts_id: str | None + - nabat_latitude: float | None + - nabat_longitude: float | None + - nabat_site_name: str | None + - nabat_activation_start_time: datetime | None + - nabat_activation_end_time: datetime | None + - nabat_software_type: str | None + - nabat_species_list: list[str] | None + - nabat_comments: str | None + - nabat_detector_type: str | None + - nabat_unusual_occurrences: str | None + + Raises: + Exception: If the file cannot be read or processed + """ + file_path = Path(file_path) + + # Read GUANO metadata from the file + gfile = GuanoFile(str(file_path)) + + # Extract required NABat fields + nabat_fields = { + 'nabat_grid_cell_grts_id': gfile.get('NABat|Grid Cell GRTS ID', None), + 'nabat_latitude': gfile.get('NABat|Latitude', None), + 'nabat_longitude': gfile.get('NABat|Longitude', None), + 'nabat_site_name': gfile.get('NABat|Site Name', None), + } + + # Fix longitude if positive (individuals don't put the - in the longitude) + # GUANO metadata is supposed to be WGS84, but some individuals don't put the - in the longitude. + if nabat_fields['nabat_longitude']: + try: + longitude = float(nabat_fields['nabat_longitude']) + if longitude > 0: + nabat_fields['nabat_longitude'] = longitude * -1 + else: + nabat_fields['nabat_longitude'] = longitude + except (ValueError, TypeError): + nabat_fields['nabat_longitude'] = None + + # Convert latitude to float if present + if nabat_fields['nabat_latitude']: + try: + nabat_fields['nabat_latitude'] = float(nabat_fields['nabat_latitude']) + except (ValueError, TypeError): + nabat_fields['nabat_latitude'] = None + + # Extract additional fields with conditionals + additional_fields = { + 'nabat_activation_start_time': ( + parse_datetime(gfile.get('NABat|Activation start time', None)) + if 'NABat|Activation start time' in gfile + else None + ), + 'nabat_activation_end_time': ( + parse_datetime(gfile.get('NABat|Activation end time', None)) + if 'NABat|Activation end time' in gfile + else None + ), + 'nabat_software_type': gfile.get('NABat|Software type', None), + 'nabat_species_list': ( + [s.strip() for s in gfile.get('NABat|Species List', '').split(',') if s.strip()] + if gfile.get('NABat|Species List', '') + else None + ), + 'nabat_comments': gfile.get('NABat|Comments', None), + 'nabat_detector_type': gfile.get('NABat|Detector type', None), + 'nabat_unusual_occurrences': gfile.get('NABat|Unusual occurrences', '') or None, + } + + # Combine all extracted fields + metadata = {**nabat_fields, **additional_fields} + + # If GUANO metadata is missing key fields, try to extract from filename + # as fallback + file_path_obj = Path(file_path) + if check_filename: + filename_metadata = extract_metadata_from_filename(file_path_obj.name) + # Only fill in missing values from filename, don't overwrite existing + # GUANO metadata + if filename_metadata: + # Fill in grid cell ID if missing + grid_cell_id = filename_metadata.get('nabat_grid_cell_grts_id') + if not metadata.get('nabat_grid_cell_grts_id') and grid_cell_id: + metadata['nabat_grid_cell_grts_id'] = grid_cell_id + + # Fill in activation start time if missing + activation_time = filename_metadata.get('nabat_activation_start_time') + if not metadata.get('nabat_activation_start_time') and activation_time: + metadata['nabat_activation_start_time'] = activation_time + + # Store quadrant if found (for potential use in getting location) + if 
filename_metadata.get('quadrant'): + metadata['quadrant'] = filename_metadata['quadrant'] + + return metadata diff --git a/bats_ai/core/views/guanometadata.py b/bats_ai/core/views/guanometadata.py index 23c15bd2..944e3a14 100644 --- a/bats_ai/core/views/guanometadata.py +++ b/bats_ai/core/views/guanometadata.py @@ -2,11 +2,12 @@ import logging from django.http import HttpRequest, JsonResponse -from guano import GuanoFile from ninja import File, Schema from ninja.files import UploadedFile from ninja.pagination import RouterPaginated +from bats_ai.core.utils.guano_utils import extract_guano_metadata + router = RouterPaginated() logger = logging.getLogger(__name__) @@ -34,58 +35,10 @@ def default_data( audio_file: File[UploadedFile], ): try: - # Read GUANO metadata from the file name provided - gfile = GuanoFile(audio_file.file.name) - - # Extract required NABat fields - nabat_fields = { - 'nabat_grid_cell_grts_id': gfile.get('NABat|Grid Cell GRTS ID', None), - 'nabat_latitude': (gfile.get('NABat|Latitude', None)), - 'nabat_longitude': (gfile.get('NABat|Longitude', None)), - 'nabat_site_name': gfile.get('NABat|Site Name', None), - } - if ( - nabat_fields['nabat_longitude'] and float(nabat_fields['nabat_longitude']) > 0 - ): # individuals don't put the - in the longitude - nabat_fields['nabat_longitude'] = str(float(nabat_fields['nabat_longitude']) * -1) - # Extract additional fields with conditionals - additional_fields = { - 'nabat_activation_start_time': ( - parse_datetime(gfile.get('NABat|Activation start time', None)) - if 'NABat|Activation start time' in gfile - else None - ), - 'nabat_activation_end_time': ( - parse_datetime(gfile.get('NABat|Activation end time', None)) - if 'NABat|Activation end time' in gfile - else None - ), - 'nabat_software_type': gfile.get('NABat|Software type', None), - 'nabat_species_list': gfile.get('NABat|Species List', '').split(','), - 'nabat_comments': gfile.get('NABat|Comments', None), - 'nabat_detector_type': gfile.get('NABat|Detector type', None), - 'nabat_unusual_occurrences': gfile.get('NABat|Unusual occurrences', ''), - } - - # Combine all extracted fields - metadata = {**nabat_fields, **additional_fields} - + # Extract GUANO metadata using utility function + metadata = extract_guano_metadata(audio_file.file.name) return JsonResponse(metadata, safe=False) except Exception as e: + logger.exception('Error extracting GUANO metadata', exc_info=e) return JsonResponse({'error': str(e)}, status=500) - - -def parse_datetime(datetime_str): - if datetime_str: - try: - # Try parsing using the custom format - return datetime.strptime(datetime_str, '%Y%m%dT%H%M%S') - except ValueError: - try: - # Try parsing using ISO format - return datetime.fromisoformat(datetime_str) - except ValueError: - # If both formats fail, return None or handle the error accordingly - return None - return None
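For reference, a minimal sketch of calling the extracted utility directly (for example from `./manage.py shell`), the same entry point the view and the management command now share; the file path is a placeholder, and the returned keys are the NABat fields listed in the `extract_guano_metadata` docstring:

```python
# Illustrative sketch only; '/data/example.wav' is a placeholder path.
from bats_ai.core.utils.guano_utils import extract_guano_metadata

metadata = extract_guano_metadata('/data/example.wav', check_filename=True)

# Keys follow the docstring, e.g. grid cell, location, and activation start time.
print(metadata['nabat_grid_cell_grts_id'])
print(metadata['nabat_latitude'], metadata['nabat_longitude'])
print(metadata['nabat_activation_start_time'])
```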