Kamon/inference.py at main · SakanaAI/Kamon · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
#!/usr/bin/env python3
"""Standalone inference script for Kamon image-to-text models."""

import os
import sys
import json
import jsonlines
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from absl import app, flags
import numpy as np
import jaconv

# Add current directory to path
sys.path.append(os.path.dirname(__file__))

import kamon_dataset as kd
from vgg_image_to_text_model import VGGImageToTextModel

# Define command line flags
FLAGS = flags.FLAGS
flags.DEFINE_string('checkpoint_path', '', 'Path to checkpoint file (.pt)')
flags.DEFINE_enum(
    'model',
    'auto',
    ['auto', 'vgg', 'vit_decoder'],
    'Model architecture override (auto uses checkpoint config)',
)
flags.DEFINE_integer('start_token', -1, 'Override start token id for vit_decoder (-1 uses checkpoint config)')
flags.DEFINE_string('dataset_subset', 'test', 'Dataset subset: train, val, or test')
flags.DEFINE_boolean('omit_edo', True, 'Whether to omit Edo period images')
flags.DEFINE_string('output_file', 'inference_results.jsonl', 'Output JSONL file')
flags.DEFINE_integer('batch_size', 16, 'Batch size for inference')
flags.DEFINE_string('device', 'auto', 'Device to use (cuda, cpu, or auto)')
flags.DEFINE_boolean('synthetic', False, 'Use synthetic data')
flags.DEFINE_boolean('combined', False, 'Use combined data')
flags.DEFINE_boolean(
    'test_on_real',
    False,
    'Test on real data even if using training from synthetic data'
)
flags.DEFINE_list('omit_from_test_val', [], 'Omit these classes from test/val')
flags.DEFINE_string('training_parsed', None, 'Custom parsed data used for training')
flags.DEFINE_string('training_translated', None, 'Custom translated data used for training')
flags.DEFINE_string('training_descriptions', None, 'Custom descriptions used for training')
flags.DEFINE_string('parsed', None, 'Custom parsed data')
flags.DEFINE_string('translated', None, 'Custom translated data')
flags.DEFINE_string('descriptions', None, 'Custom descriptions')
flags.DEFINE_bool(
    'use_bigrams',
    False,
    'Constrain decoding with training bigrams',
)


def load_checkpoint(checkpoint_path, device, *, model_override: str = 'auto', start_token_override: int = -1):
    """Load model checkpoint and return model and metadata."""
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint file not found: {checkpoint_path}")

    print(f"Loading checkpoint from: {checkpoint_path}")
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # Extract model configuration from checkpoint
    vocab_size = checkpoint['vocab_size']
    max_seq_len = checkpoint['max_seq_len']
    end_token = checkpoint['end_token']
    label_to_expr = checkpoint['label_to_expr']

    # Try to get model config, with fallbacks for older checkpoints
    model_config = checkpoint.get('config', {})
    image_size = model_config.get('image_size', 224)

    checkpoint_model_name = model_config.get('model', 'vgg')
    model_name = model_override if model_override != 'auto' else checkpoint_model_name
    if model_name not in {'vgg', 'vit_decoder'}:
        print(f"Warning: Unknown model '{model_name}', falling back to 'vgg'")
        model_name = 'vgg'

    checkpoint_start_token = model_config.get('start_token', 0)
    start_token = start_token_override if start_token_override != -1 else checkpoint_start_token

    if model_name == 'vgg':
        hidden_dim = model_config.get('hidden_dim', 512)
        also_train_vgg = model_config.get('also_train_vgg', False)
        use_masks = model_config.get('use_masks', True)  # Default to True for backward compatibility

        # Infer ngram_length from the feature_combiner input dimension if not in config
        if 'ngram_length' in model_config:
            ngram_length = model_config['ngram_length']
        else:
            # Infer from feature_combiner.0.weight shape
            # Input dim = vgg_feature_dim + (ngram_length - 1) * (vgg_feature_dim + vocab_size)
            # Assuming vgg_feature_dim = 4096
            vgg_feature_dim = 4096
            feature_combiner_input_dim = checkpoint['model_state_dict']['feature_combiner.0.weight'].shape[1]

            # Solve: feature_combiner_input_dim = vgg_feature_dim + (ngram_length - 1) * (vgg_feature_dim + vocab_size)
            # Rearrange: (feature_combiner_input_dim - vgg_feature_dim) = (ngram_length - 1) * (vgg_feature_dim + vocab_size)
            # ngram_length = 1 + (feature_combiner_input_dim - vgg_feature_dim) / (vgg_feature_dim + vocab_size)
            ngram_length = 1 + (feature_combiner_input_dim - vgg_feature_dim) // (vgg_feature_dim + vocab_size)
            print(f"Inferred ngram_length = {ngram_length} from checkpoint dimensions")

        # Create model
        model = VGGImageToTextModel(
            vocab_size=vocab_size,
            max_seq_len=max_seq_len,
            image_size=image_size,
            ngram_length=ngram_length,
            hidden_dim=hidden_dim,
            also_train_vgg=also_train_vgg,
            use_masks=use_masks,
        )
    elif model_name == 'vit_decoder':
        from vit_model import DecoderImageCaptioner

        model = DecoderImageCaptioner(
            encoder_name=model_config.get('vit_encoder_name', 'vit_base_patch16_224'),
            seq_len=max_seq_len,
            vocab_size=vocab_size,
            n_heads=model_config.get('vit_n_heads', 8),
            d_model=model_config.get('vit_d_model', 512),
            n_layers=model_config.get('vit_n_layers', 6),
            dropout=model_config.get('vit_dropout', 0.1),
            token_dropout=model_config.get('vit_token_dropout', 0.0),
            train_backbone=model_config.get('vit_train_backbone', False),
            enc_proj_rank=model_config.get('vit_enc_proj_rank', 0),
        )
    else:
        raise ValueError(f"Unknown model: {model_name}")

    # Load model weights
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    metadata = {
        'model': model_name,
        'vocab_size': vocab_size,
        'max_seq_len': max_seq_len,
        'end_token': end_token,
        'label_to_expr': label_to_expr,
        'image_size': image_size,
        'start_token': start_token,
        'step': checkpoint.get('step', 'unknown'),
        'epoch': checkpoint.get('epoch', 'unknown'),
        'val_loss': checkpoint.get('val_loss', checkpoint.get('loss', 'unknown'))
    }

    return model, metadata


def generate_tokens(model, model_name, images, end_token, *, start_token: int, max_length: int, bigrams=None):
    if model_name == 'vgg':
        tokens, _ = model.generate(images, end_token, max_length=max_length, bigrams=bigrams)
        return tokens
    if model_name == 'vit_decoder':
        return model.generate(images, end_token=end_token, start_token=start_token, max_length=max_length)
    raise ValueError(f"Unknown model: {model_name}")


def normalize_description(desc):
    """Normalize description by removing whitespace and converting to hiragana.

    Args:
        desc: Description string

    Returns:
        Normalized description with spaces removed and katakana converted to hiragana
    """
    # Remove all whitespace
    desc = desc.replace(' ', '').replace('\t', '').replace('\n', '')
    # Convert katakana to hiragana for consistent comparison
    desc = jaconv.kata2hira(desc)
    return desc


def build_description_to_images_map(train_metadata):
    """Build a mapping from descriptions to lists of image paths in training data.

    Args:
        train_metadata: List of metadata dicts from training dataset

    Returns:
        Dictionary mapping normalized description strings to lists of image paths
    """
    desc_to_images = {}
    for item in train_metadata:
        desc = item.get('description', '')
        img_path = item.get('path', '')
        if desc and img_path:
            # Normalize description by removing spaces for consistent lookup
            normalized_desc = normalize_description(desc)
            if normalized_desc not in desc_to_images:
                desc_to_images[normalized_desc] = []
            desc_to_images[normalized_desc].append(img_path)
    return desc_to_images


def run_inference(
    model,
    dataloader,
    device,
    label_to_expr,
    end_token,
    dataset_metadata,
    train_desc_to_images,
    bigrams,
    *,
    model_name: str,
    start_token: int,
):
    """Run inference on dataset and return results."""
    model.eval()
    results = []

    print(f"Running inference on {len(dataloader)} batches...")

    with torch.no_grad():
        for batch_idx, (images, target_tokens) in enumerate(dataloader):
            images = images.to(device)
            target_tokens = target_tokens.to(device)

            # Generate predictions
            pred_tokens = generate_tokens(
                model,
                model_name,
                images,
                end_token,
                start_token=start_token,
                max_length=target_tokens.shape[1],
                bigrams=bigrams,
            )

            batch_size = images.size(0)

            # Process each example in the batch
            for i in range(batch_size):
                # Get predicted tokens
                pred_tokens_list = pred_tokens[i].cpu().tolist()
                # Find end token position in predictions
                try:
                    end_pos = pred_tokens_list.index(end_token)
                    pred_tokens_list = pred_tokens_list[:end_pos]
                except ValueError:
                    pass

                predicted_description = ' '.join([label_to_expr.get(token, f'<UNK:{token}>') for token in pred_tokens_list])

                # Get ground truth tokens
                gt_tokens_list = target_tokens[i].cpu().tolist()
                # Find end token position in ground truth
                try:
                    gt_end_pos = gt_tokens_list.index(end_token)
                    gt_tokens_list = gt_tokens_list[:gt_end_pos]
                except ValueError:
                    pass

                reference_description = ' '.join([label_to_expr.get(token, f'<UNK:{token}>') for token in gt_tokens_list])

                # Get image path and other metadata from the dataset
                example_idx = batch_idx * dataloader.batch_size + i
                if example_idx < len(dataset_metadata):
                    item_metadata = dataset_metadata[example_idx]
                    image_path = item_metadata.get('path', '')
                    translation = item_metadata.get('translation', '')
                else:
                    image_path = ''
                    translation = ''

                # Lookup training images with same description as reference
                # Normalize descriptions by removing spaces before lookup
                normalized_reference = normalize_description(reference_description)
                train_images_reference = train_desc_to_images.get(normalized_reference, [])

                # Lookup training images with same description as predicted
                normalized_predicted = normalize_description(predicted_description)
                train_images_predicted = train_desc_to_images.get(normalized_predicted, [])

                result = {
                    'reference': reference_description,
                    'predicted': predicted_description,
                    'image': image_path,
                    'translation': translation,
                    'train_images_reference': train_images_reference,
                    'train_images_predicted': train_images_predicted,
                    'reference_tokens': gt_tokens_list,
                    'predicted_tokens': pred_tokens_list,
                    'batch_idx': batch_idx,
                    'example_idx': i
                }

                results.append(result)

            # Progress reporting
            if (batch_idx + 1) % 10 == 0:
                print(f"Processed {batch_idx + 1}/{len(dataloader)} batches")

    return results


def main(argv):
    del argv  # Unused

    # Validate required arguments
    if not FLAGS.checkpoint_path:
        print("Error: --checkpoint_path is required")
        return 1

    if FLAGS.dataset_subset not in ['train', 'val', 'test']:
        print("Error: --dataset_subset must be one of: train, val, test")
        return 1

    # Set device
    if FLAGS.device == 'auto':
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    else:
        device = torch.device(FLAGS.device)

    print(f"Using device: {device}")

    try:
        # Load checkpoint and create model
        model, checkpoint_metadata = load_checkpoint(
            FLAGS.checkpoint_path,
            device,
            model_override=FLAGS.model,
            start_token_override=FLAGS.start_token,
        )

        print(f"Loaded model from step {checkpoint_metadata['step']}, epoch {checkpoint_metadata['epoch']}")
        print(f"Model: {checkpoint_metadata['model']}")
        print(f"Vocabulary size: {checkpoint_metadata['vocab_size']}")
        print(f"Max sequence length: {checkpoint_metadata['max_seq_len']}")
        if checkpoint_metadata['val_loss'] != 'unknown':
            print(f"Validation loss: {checkpoint_metadata['val_loss']}")

        # Load training dataset to build description-to-images mapping
        if FLAGS.synthetic:
            print("Using synthetic data...")
            parsed = "synthetic_examples/synthetic_parsed.jsonl"
            translated = "synthetic_examples/synthetic_translated.jsonl"
            descriptions = "synthetic_examples/synthetic.jsonl"
            kd.reload_data(parsed, translated, descriptions)
        elif FLAGS.combined:
            print("Using combined data...")
            parsed = "for_paper/combined_parsed.jsonl"
            translated = "for_paper/combined_translated.jsonl"
            descriptions = "for_paper/combined.jsonl"
            kd.reload_data(parsed, translated, descriptions)
        elif FLAGS.training_parsed and FLAGS.training_translated and FLAGS.training_descriptions:
            print(f"Loading the data used for training from ...{FLAGS.training_parsed}")
            parsed = FLAGS.parsed
            translated = FLAGS.translated
            descriptions = FLAGS.descriptions
            kd.reload_data(parsed, translated, descriptions)
        print(f"Loading training dataset for description lookup...")
        train_dataset = kd.KamonDataset(
            division="train",
            image_size=checkpoint_metadata['image_size'],
            num_augmentations=0,  # No augmentation needed for lookup
            one_hot=False,
            omit_edo=FLAGS.omit_edo,
        )
        bigrams = train_dataset.bigrams if FLAGS.use_bigrams else None
        train_desc_to_images = build_description_to_images_map(train_dataset.metadata)
        print(f"Built training description map with {len(train_desc_to_images)} unique descriptions")

        # Load evaluation dataset
        expr_to_label = None
        if FLAGS.test_on_real:
            print("Testing on real data...")
            expr_to_label = train_dataset.expr_to_label
            parsed = kd.ORIG_PARSED
            translated = kd.ORIG_TRANSLATED
            descriptions = kd.ORIG_DESCRIPTIONS
            kd.reload_data(parsed, translated, descriptions)
        elif FLAGS.parsed and FLAGS.translated and FLAGS.descriptions:
            print(f"Testing on custom data...{FLAGS.parsed}")
            expr_to_label = train_dataset.expr_to_label
            parsed = FLAGS.parsed
            translated = FLAGS.translated
            descriptions = FLAGS.descriptions
            kd.reload_data(parsed, translated, descriptions)

        if bigrams:
            expr_to_label = train_dataset.expr_to_label
        print(f"Loading {FLAGS.dataset_subset} dataset (omit_edo={FLAGS.omit_edo})...")
        dataset = kd.KamonDataset(
            division=FLAGS.dataset_subset,
            image_size=checkpoint_metadata['image_size'],
            num_augmentations=0,  # No augmentation for inference
            one_hot=False,
            omit_edo=FLAGS.omit_edo,
            expr_to_label=expr_to_label,
            omit_from_test_val=FLAGS.omit_from_test_val,
        )
        if FLAGS.test_on_real:
            checkpoint_metadata['label_to_expr'] = dataset.label_to_expr

        print(f"Dataset size: {len(dataset)} examples")

        # Create dataloader
        dataloader = DataLoader(
            dataset,
            batch_size=FLAGS.batch_size,
            shuffle=False,  # Keep original order for inference
            num_workers=4,
            pin_memory=True
        )

        # Get dataset metadata for image paths and translations
        dataset_metadata = dataset.metadata

        # Run inference
        results = run_inference(
            model,
            dataloader,
            device,
            checkpoint_metadata['label_to_expr'],
            checkpoint_metadata['end_token'],
            dataset_metadata,
            train_desc_to_images,
            bigrams,
            model_name=checkpoint_metadata['model'],
            start_token=checkpoint_metadata['start_token'],
        )

        # Save results
        print(f"Saving results to: {FLAGS.output_file}")
        with jsonlines.open(FLAGS.output_file, 'w') as writer:
            for result in results:
                writer.write(result)

        # Calculate and print summary statistics
        total_examples = len(results)
        correct_predictions = sum(1 for r in results if r['reference'] == r['predicted'])
        accuracy = correct_predictions / total_examples if total_examples > 0 else 0

        print(f"\nInference completed!")
        print(f"Total examples: {total_examples}")
        print(f"Correct predictions: {correct_predictions}")
        print(f"Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
        print(f"Results saved to: {FLAGS.output_file}")

        return 0

    except Exception as e:
        print(f"Error during inference: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == '__main__':
    app.run(main)