tz_kaggle/LLM_models.py at main · tztechno/tz_kaggle · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
##############################################
##############################################
##############################################
##############################################
##############################################
##############################################
##############################################
[Standard form of the model forward pass for training]

        # Fetch the current example from the training environment: `inputs` is
        # indexed below by 'input_ids' and 'attention_mask', and `target_ids`
        # is passed to the model as the labels.
        # NOTE(review): presumably both are already on the model's device —
        # confirm against the environment's get_current_* implementations.
        inputs = self.train_env.get_current_input()
        target_ids = self.train_env.get_current_target()

        # Supervised forward pass: supplying `labels` together with the inputs
        # lets the model return a loss alongside its other outputs
        # (teacher forcing).
        # NOTE(review): if the targets are padded, padding positions should
        # normally be set to the loss ignore index (-100 for Hugging Face
        # models) before being used as labels — verify how targets are built.
        outputs = self.model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            labels=target_ids
        )

        # Read the loss the model computed during the forward pass above
        # (nothing is recomputed here).
        loss = outputs.loss

##############################################
[DeepSeek R1 training]

    def get_current_input(self):
        """Tokenize the question at ``self.current_idx`` as a model input.

        The question is embedded in the ``"Question: ..."`` prompt used for
        DeepSeek R1 and handed to ``self.tokenizer`` with truncation and
        ``'max_length'`` padding, both bounded by ``self.max_length``.

        Returns:
            A plain dict holding every field of the tokenizer output, with
            each value moved to the module-level ``device``.
        """
        question = self.questions[self.current_idx]
        prompt = f"Question: {question}"

        # Same tokenizer settings as the original preprocess_function.
        tokenizer_options = {
            'return_tensors': 'pt',
            'truncation': True,
            'max_length': self.max_length,
            'padding': 'max_length',
        }
        tokenized = self.tokenizer(prompt, **tokenizer_options)

        on_device = {}
        for field, tensor in tokenized.items():
            on_device[field] = tensor.to(device)
        return on_device

    def get_current_target(self, ignore_index=None):
        """Tokenize the answer at ``self.current_idx`` as the training target.

        The answer is tokenized with ``text_target=`` using truncation and
        ``'max_length'`` padding, both bounded by ``self.max_length``.

        Because the targets are padded, feeding them to a model as ``labels``
        unchanged makes the padding positions count towards the loss. Pass
        ``ignore_index`` (``-100`` for Hugging Face models) to replace the
        padding positions so the loss skips them.

        Args:
            ignore_index: Optional value written into every padding position
                of the returned ids. ``None`` (the default) returns the raw
                token ids, exactly as before, which is what callers that
                decode the ids back to text need.

        Returns:
            The target ``input_ids`` tensor, moved to the module-level
            ``device``.
        """
        answer = self.answers[self.current_idx]

        # Tokenize target with the same parameters as the input side.
        target_encoding = self.tokenizer(
            text_target=answer,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
        )
        target_ids = target_encoding['input_ids']

        if ignore_index is not None:
            if 'attention_mask' in target_encoding:
                # Prefer the attention mask: it marks exactly the padded
                # positions, even when the pad token doubles as EOS.
                padding_mask = target_encoding['attention_mask'] == 0
            else:
                # Fallback when the tokenizer returns no attention mask.
                padding_mask = target_ids == self.tokenizer.pad_token_id
            target_ids = target_ids.masked_fill(padding_mask, ignore_index)

        return target_ids.to(device)

##############################################
[T5 training loop]

    def get_current_input(self):
        """Tokenize the question at ``self.current_idx`` as a T5 input.

        The question is prefixed with the ``"answer: "`` task prefix and
        tokenized with truncation at 128 tokens (no padding is requested).

        Returns:
            A plain dict holding every field of the tokenizer output, with
            each value moved to the module-level ``device``.
        """
        # T5 expects a task-specific prefix in front of the raw text.
        prompt = f"answer: {self.questions[self.current_idx]}"
        tokenized = self.tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            max_length=128,
        )

        on_device = {}
        for field, tensor in tokenized.items():
            on_device[field] = tensor.to(device)
        return on_device

    def get_current_target(self):
        """Tokenize the answer at ``self.current_idx`` as the T5 target.

        The answer is tokenized with ``text_target=`` and truncation at
        128 tokens (no padding is requested).

        Returns:
            The target ``input_ids`` tensor, moved to the module-level
            ``device``.
        """
        tokenized = self.tokenizer(
            text_target=self.answers[self.current_idx],
            return_tensors='pt',
            truncation=True,
            max_length=128,
        )
        label_ids = tokenized['input_ids']
        return label_ids.to(device)

##############################################