-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_processor.py
More file actions
268 lines (214 loc) · 10.3 KB
/
data_processor.py
File metadata and controls
268 lines (214 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
"""
数据处理模块 - 负责数据清洗、分析和指标计算,支持多指标分析和投资决策
"""
import pandas as pd
import numpy as np
import logging
from datetime import datetime
from typing import Dict, List, Any
logger = logging.getLogger(__name__)
class DataProcessor:
"""数据处理器"""
def __init__(self):
"""初始化数据处理器"""
pass
def analyze_data(self, df: pd.DataFrame, bond_yield_data: Dict = None) -> Dict[str, Any]:
"""
分析CSV数据,提取关键指标,整合国债收益率数据
Args:
df: 原始CSV数据
bond_yield_data: 国债收益率数据字典(可选)
Returns:
Dict: 包含分析结果的字典
"""
try:
logger.info("开始数据分析...")
# 数据预处理
processed_df = self._preprocess_data(df)
# 计算基础指标
metrics = self._calculate_metrics(processed_df)
# 整合国债收益率数据
if bond_yield_data:
metrics.update({
'bond_yield': bond_yield_data.get('current_yield'),
'bond_yield_change': bond_yield_data.get('yield_change'),
'bond_date': bond_yield_data.get('date'),
'bond_name': bond_yield_data.get('bond_name')
})
# 计算股息率与国债收益率的对比
if metrics.get('current_rate') and metrics.get('bond_yield'):
metrics['dividend_bond_spread'] = metrics['current_rate'] - metrics['bond_yield']
metrics['dividend_bond_ratio'] = metrics['current_rate'] / metrics['bond_yield'] if metrics['bond_yield'] != 0 else None
# 生成投资决策建议
investment_advice = self._generate_investment_advice(metrics)
metrics['investment_advice'] = investment_advice
# 生成分析结果
analysis_result = {
'raw_data': df,
'processed_data': processed_df,
'metrics': metrics,
'analysis_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
'bond_yield_data': bond_yield_data
}
logger.info("数据分析完成")
return analysis_result
except Exception as e:
logger.error(f"数据分析失败: {str(e)}")
raise Exception(f"数据分析过程中发生错误: {str(e)}")
def _preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""
数据预处理
Args:
df: 原始数据框
Returns:
pd.DataFrame: 处理后的数据框
"""
# 创建数据副本
processed_df = df.copy()
# 数据类型转换和清理
# 注意:这里需要根据实际Excel结构调整列名
if '日期Date' in processed_df.columns:
processed_df['date'] = pd.to_datetime(processed_df['日期Date'], format='%Y%m%d', errors='coerce')
processed_df = processed_df.dropna(subset=['date'])
if '股息率2(计算用股本)D/P2' in processed_df.columns:
processed_df['dividend_rate'] = pd.to_numeric(
processed_df['股息率2(计算用股本)D/P2'], errors='coerce'
)
processed_df = processed_df.dropna(subset=['dividend_rate'])
# 添加PE列处理
if '市盈率2(计算用股本)P/E2' in processed_df.columns:
processed_df['pe_ratio'] = pd.to_numeric(
processed_df['市盈率2(计算用股本)P/E2'], errors='coerce'
)
# 按日期降序排序(最新日期在前)
if 'date' in processed_df.columns:
processed_df = processed_df.sort_values('date', ascending=False) # 按时间降序排列,最新日期在前
# 只保留最近15天的数据
if len(processed_df) > 15:
processed_df = processed_df.head(15)
logger.debug(f"预处理后数据形状: {processed_df.shape}")
return processed_df
def _calculate_metrics(self, df: pd.DataFrame) -> Dict[str, float]:
"""
计算关键指标
Args:
df: 处理后的数据框
Returns:
Dict: 指标字典
"""
metrics = {}
if 'dividend_rate' not in df.columns:
logger.warning("数据中未找到dividend_rate列")
return metrics
dividend_rates = df['dividend_rate']
# 基础统计指标
metrics['current_rate'] = float(dividend_rates.iloc[0]) if len(dividend_rates) > 0 else 0
metrics['avg_15d'] = float(dividend_rates.mean())
metrics['max_15d'] = float(dividend_rates.max())
metrics['min_15d'] = float(dividend_rates.min())
metrics['std_15d'] = float(dividend_rates.std())
# 计算PE指标(如果数据源包含)
if 'pe_ratio' in df.columns:
pe_values = df['pe_ratio'].replace([np.inf, -np.inf], np.nan).dropna()
if len(pe_values) > 0:
metrics['pe'] = float(pe_values.iloc[0])
metrics['pe_avg_15d'] = float(pe_values.mean())
# 计算PE历史分位数
pe_min = float(pe_values.min())
pe_max = float(pe_values.max())
if pe_max != pe_min:
metrics['pe_percentile'] = float((metrics['pe'] - pe_min) / (pe_max - pe_min) * 100)
else:
metrics['pe_percentile'] = 50.0
# 趋势指标
if len(dividend_rates) >= 2:
metrics['daily_change'] = float(dividend_rates.iloc[0] - dividend_rates.iloc[1])
metrics['change_percent'] = float((metrics['daily_change'] / dividend_rates.iloc[1]) * 100) if dividend_rates.iloc[1] != 0 else 0
# 相对位置指标
metrics['percentile_15d'] = float(
(dividend_rates.iloc[0] - metrics['min_15d']) /
(metrics['max_15d'] - metrics['min_15d']) * 100
) if metrics['max_15d'] != metrics['min_15d'] else 50
logger.debug(f"计算得到的指标: {metrics}")
return metrics
def get_trend_analysis(self, df: pd.DataFrame) -> str:
"""
获取趋势分析文本
Args:
df: 数据框
Returns:
str: 趋势分析描述
"""
if 'dividend_rate' not in df.columns or len(df) < 2:
return "数据不足,无法进行趋势分析"
current_rate = df['dividend_rate'].iloc[0]
previous_rate = df['dividend_rate'].iloc[1]
if current_rate > previous_rate:
trend = "上升"
elif current_rate < previous_rate:
trend = "下降"
else:
trend = "持平"
return f"股息率{trend},当前值为{current_rate:.4f}"
def _generate_investment_advice(self, metrics: Dict[str, Any]) -> Dict[str, Any]:
"""
生成投资决策建议(简化版,专注股息率和国债收益率)
Args:
metrics: 指标字典
Returns:
Dict: 投资决策建议
"""
advice = {
'action': '持有', # 买入/持有/卖出
'confidence': 0.5, # 信心度 0-1
'reasons': [],
'risks': [],
'summary': ''
}
# 1. 股息率分析
dividend_rate = metrics.get('current_rate')
dividend_percentile = metrics.get('percentile_15d', 50)
if dividend_rate is not None:
if dividend_percentile > 70:
advice['reasons'].append(f"股息率处于历史高位(分位数{dividend_percentile:.1f}%)")
advice['confidence'] += 0.1
elif dividend_percentile < 30:
advice['reasons'].append(f"股息率处于历史低位(分位数{dividend_percentile:.1f}%)")
advice['confidence'] += 0.15
# 股息率趋势
daily_change = metrics.get('daily_change', 0)
if daily_change > 0.05:
advice['reasons'].append("股息率呈现上升趋势")
elif daily_change < -0.05:
advice['reasons'].append("股息率呈现下降趋势")
# 2. 国债收益率对比分析
bond_yield = metrics.get('bond_yield')
dividend_bond_spread = metrics.get('dividend_bond_spread')
if bond_yield is not None and dividend_rate is not None:
if dividend_bond_spread > 1.0:
advice['reasons'].append(f"股息率显著高于国债收益率(差额{ dividend_bond_spread:.2f}%)")
advice['confidence'] += 0.2
elif dividend_bond_spread < 0:
advice['risks'].append(f"股息率低于国债收益率(差额{ dividend_bond_spread:.2f}%)")
advice['confidence'] -= 0.15
else:
advice['reasons'].append(f"股息率与国债收益率基本相当(差额{ dividend_bond_spread:.2f}%)")
# 4. 综合决策
confidence = max(0, min(1, advice['confidence'])) # 限制在0-1之间
if confidence > 0.7:
advice['action'] = '买入'
advice['summary'] = '估值吸引,股息率有优势,建议买入'
elif confidence > 0.4:
advice['action'] = '持有'
advice['summary'] = '估值合理,建议持有观察'
else:
advice['action'] = '卖出'
advice['summary'] = '估值偏高或股息率无优势,建议卖出'
advice['confidence'] = confidence
# 如果没有具体原因,添加默认说明
if not advice['reasons']:
advice['reasons'].append("基于当前市场环境和历史数据分析")
if not advice['risks']:
advice['risks'].append("市场波动风险始终存在")
logger.info(f"投资决策生成: {advice['action']} (信心度: {confidence:.2f})")
return advice