-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHtmlToCsvConverter.cs
More file actions
158 lines (132 loc) · 5.29 KB
/
HtmlToCsvConverter.cs
File metadata and controls
158 lines (132 loc) · 5.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
using System.Text;
using HtmlAgilityPack;
using HtmlDocument = HtmlAgilityPack.HtmlDocument;
namespace AlexaToExcel
{
class HtmlToCsvConverter
{
/// <summary>
/// Reads all <table> elements from the given HTML file and writes them
/// into a single CSV file. If the HTML is a Google Drive / Sheets wrapper
/// that loads data via an iframe, the converter automatically follows the
/// iframe src into the companion <c>_files</c> folder.
/// </summary>
public static int Convert(string htmlPath, string csvPath)
{
if (!File.Exists(htmlPath))
{
throw new FileNotFoundException($"HTML file not found: {htmlPath}");
}
// Resolve the actual data file (Google Drive saves data in a _files subfolder)
string resolvedPath = ResolveDataFile(htmlPath);
var doc = new HtmlDocument();
doc.Load(resolvedPath, Encoding.UTF8);
// Google Sheets uses <table class="waffle"> for the main data table
var tables = doc.DocumentNode.SelectNodes("//table[contains(@class,'waffle')]")
?? doc.DocumentNode.SelectNodes("//table");
if (tables == null || tables.Count == 0)
{
throw new InvalidOperationException("No <table> elements found in the HTML file.");
}
int totalRows = 0;
using var writer = new StreamWriter(csvPath, false, new UTF8Encoding(true));
for (int t = 0; t < tables.Count; t++)
{
if (t > 0)
{
writer.WriteLine(); // blank line between tables
}
var table = tables[t];
// Collect all rows from <tbody> (skip <thead> — Google Sheets puts only
// column-width shims there, not real headers)
var bodyRows = table.SelectNodes(".//tbody/tr")
?? table.SelectNodes(".//tr");
if (bodyRows == null)
{
continue;
}
foreach (var row in bodyRows)
{
// Skip freezebar / separator rows injected by Google Sheets
if (row.InnerHtml.Contains("freezebar-cell", StringComparison.OrdinalIgnoreCase))
{
continue;
}
// Only pick up <td> cells — skip <th> row-header cells (row numbers)
var cells = row.SelectNodes("./td");
if (cells == null || cells.Count == 0)
{
continue;
}
var values = new List<string>();
foreach (var cell in cells)
{
values.Add(EscapeCsvField(GetCellText(cell)));
}
writer.WriteLine(string.Join(",", values));
totalRows++;
}
}
return totalRows;
}
/// <summary>
/// If the HTML file is a Google Drive wrapper that uses an iframe to load
/// the actual spreadsheet data, return the path to the inner HTML file.
/// Otherwise return the original path unchanged.
/// </summary>
private static string ResolveDataFile(string htmlPath)
{
var doc = new HtmlDocument();
doc.Load(htmlPath, Encoding.UTF8);
var iframe = doc.DocumentNode.SelectSingleNode("//iframe[@src]");
if (iframe == null)
{
return htmlPath;
}
string src = iframe.GetAttributeValue("src", "");
if (string.IsNullOrWhiteSpace(src))
{
return htmlPath;
}
// The src is relative (e.g. "./Name_files/sheet.html")
string dir = Path.GetDirectoryName(htmlPath) ?? ".";
string candidate = Path.GetFullPath(Path.Combine(dir, src));
if (File.Exists(candidate))
{
Console.WriteLine($" Resolved iframe → {candidate}");
return candidate;
}
return htmlPath;
}
private static string GetCellText(HtmlNode cell)
{
// Prefer link text when the cell contains an <a> element
var link = cell.SelectSingleNode(".//a");
string raw;
if (link != null)
{
raw = link.InnerText ?? "";
}
else
{
raw = cell.InnerText ?? "";
}
// Decode HTML entities and collapse whitespace
var text = HtmlEntity.DeEntitize(raw);
text = text.Replace("\r", " ").Replace("\n", " ");
while (text.Contains(" "))
{
text = text.Replace(" ", " ");
}
return text.Trim();
}
private static string EscapeCsvField(string field)
{
if (field.Contains(',') || field.Contains('"') || field.Contains('\n') || field.Contains('\r'))
{
return "\"" + field.Replace("\"", "\"\"") + "\"";
}
return field;
}
}
}