MiniData/CSV.h at master · htiek/MiniData · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#ifndef MiniData_CSV_Included
#define MiniData_CSV_Included

#include <istream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

/* Type representing data read from a CSV file containing a header row. Access is
 * provided as csv[row][column], where column can be specified either by an integer
 * or as one of the column headers.
 *
 * A default-constructed CSV holds no rows and no columns.
 */
class CSV {
public:
    /* Parsing routines. parse reads a header line followed by data lines from the
     * given stream; parseFile opens the named file and parses its contents. Both
     * report failures (unreadable source, malformed data) by throwing a CSVException.
     */
    inline static CSV parse(std::istream& source);
    inline static CSV parseFile(const std::string& filename);

    /* Basic accessors. */
    inline std::size_t numRows() const;   // Doesn't include header
    inline std::size_t numCols() const;

    /* Header information: the column headers, ordered by column index. */
    inline std::vector<std::string> headers() const;

    /* Accessor proxy class. A RowRef identifies one row of a particular CSV object.
     * It stores a pointer to that CSV, so it must not be used after the CSV it came
     * from has been destroyed.
     */
    class RowRef {
    public:
        /* Returns a copy of the entry in this row at the given column, which may be
         * named by index or by header. Throws a CSVException if the index is out of
         * range or the header is not a column of the CSV.
         */
        inline std::string operator[] (std::size_t col) const;
        inline std::string operator[] (const std::string& colHeader) const;

    private:
        /* Only CSV (a friend) creates RowRefs, via CSV::operator[]. */
        inline RowRef(const CSV* parent, std::size_t row);
        const CSV* mParent;
        std::size_t mRow;

        friend class CSV;
    };

    /* Returns a proxy for the given row (zero-indexed, header excluded). Throws a
     * CSVException if row >= numRows().
     */
    inline RowRef operator[] (std::size_t row) const;

private:
    /* The data. It's internally represented as a 2D grid of strings, along with
     * auxiliary column header data.
     *
     * The grid itself is represented in row-major order, and does not include the
     * column headers.
     *
     * mRows is zero-initialized so that a default-constructed CSV reports zero rows
     * rather than reading an indeterminate value.
     */
    std::vector<std::string> mData;
    std::size_t              mRows = 0;

    /* Column headers are encoded as a map from headers to indices, since the
     * primary operation we'll be supporting is mapping from a name to a column.
     */
    std::unordered_map<std::string, std::size_t> mColumnHeaders;
};

/* Type representing an error caused by a CSV issue. This is the exception type
 * thrown by MiniData_CSVImpl::csvError, which the parsing routines and accessors
 * in this header use to report problems.
 *
 * It derives from std::logic_error, so handlers for std::logic_error or
 * std::exception will also catch it.
 */
class CSVException: public std::logic_error {
public:
    /* Constructs the exception from a description of what went wrong. */
    inline CSVException(const std::string& message);
};


/* * * * * Implementation Below This Point * * * * */

#include <sstream>
#include <fstream>
#include <tuple>

namespace MiniData_CSVImpl {
    /* Single point through which this header reports failures: wraps the message
     * in a CSVException and throws it. Never returns to the caller.
     */
    [[noreturn]] inline void csvError(const std::string& message)
    {
        throw CSVException(message);
    }

    /* Reads a single CSV token from a source. Each token either
     *
     *  1. does not start with a quote, in which case we read up until the first comma, or
     *  2. starts with a quote, in which case we read to the upcoming close quote, watching for
     *     escaped quotes along the way.
     *
     * Empty entries are acceptable.
     */
    inline std::string readOneTokenFrom(std::istream& input) {
        /* Edge case: empty entries are fine. */
        if (input.peek() == ',') return "";

        /* If we don't start with a quote, read up until we do. */
        if (input.peek() != '"') {
            std::string result;
            while (true) {
                int ch = input.peek();
                if (ch == EOF) return result;
                if (ch == ',') return result;
                result += char(input.get());
            }
        }

        /* We are looking a quoted string. Keep reading characters, keeping in mind that a close
         * quote might not actually be the end-of-string marker.
         */
        input.get(); // Skip quotation mark

        std::string result;
        while (true) {
            int ch = input.get();

            if (ch == EOF) csvError("Unterminated string literal.");
            else if (ch != '"') result += char(ch);
            else {
                int next = input.peek();
                if (next == EOF || next == ',') return result; // End of token
                else if (next == '"') {
                    /* Consume this character so we don't process it twice. */
                    input.get();
                    result += '"';
                } else csvError("Unexpected character found after quote.");
            }
        }
    }

    /* Tokenizes a line from a CSV file, returning a list of tokens within that line.
     *
     * The line is expected to have been produced by std::getline, which strips the
     * '\n' but leaves the '\r' of a Windows-style (CRLF) line ending in place. A
     * single trailing '\r' is therefore ignored, so CRLF files tokenize the same way
     * as LF files instead of failing on a quoted final field or picking up a stray
     * '\r' in the last entry.
     *
     * Reports an error through csvError if the line is empty (after removing the
     * trailing '\r', if any) or if its entries are not properly comma-separated.
     */
    inline std::vector<std::string> tokenize(const std::string& line) {
        /* Work out how much of the line is real content. */
        std::string::size_type length = line.size();
        if (length != 0 && line[length - 1] == '\r') length--;

        /* Edge case: we assume there are no empty lines even though in principle we could
         * envision a 0 x n data array. That likely just means something went wrong.
         */
        if (length == 0) csvError("Empty line in CSV data.");

        /* Convert to a stream so the characters can be consumed one at a time. */
        std::istringstream input(line.substr(0, length));

        std::vector<std::string> result;
        while (true) {
            result.push_back(readOneTokenFrom(input));

            /* We should either see a comma or an EOF at this point. */
            if (input.peek() == EOF) return result;
            if (input.get()  != ',') csvError("Entries in CSV file aren't comma-separated?");
        }
    }

    /* Reads the first line of a CSV file, breaking it apart into headers.
     *
     * The result maps each header to its zero-based column index, in the order the
     * headers appear on the line.
     *
     * Reports an error through csvError if no line can be read from the input or if
     * the same header appears more than once. Errors from tokenizing the line
     * propagate as well.
     */
    inline std::unordered_map<std::string, std::size_t> readHeaders(std::istream& input) {
        std::string line;
        if (!std::getline(input, line)) csvError("Could not read header row from CSV source.");

        std::unordered_map<std::string, std::size_t> result;
        for (const auto& token: tokenize(line)) {
            /* The next free column index is the number of headers seen so far. emplace
             * inserts only if the header is new, so one lookup both detects duplicates
             * and records the index.
             */
            const std::size_t index = result.size();
            if (!result.emplace(token, index).second) csvError("Duplicate column header: " + token);
        }

        return result;
    }

    /* Reads the body of a CSV file under the assumption that it has a certain number of
     * columns. Lines are read from the input until no more can be extracted.
     *
     * The result is a pairing of the row-major-ordering of the data, along with the
     * number of rows in the data.
     *
     * Reports an error through csvError if any line does not contain exactly numCols
     * entries. Errors from tokenizing a line propagate as well.
     */
    inline std::tuple<std::vector<std::string>, std::size_t> readBody(std::istream& input, std::size_t numCols) {
        /* Build the flattened grid directly: each validated row is appended to the end,
         * which yields row-major order without an intermediate grid of rows.
         */
        std::vector<std::string> result;
        std::size_t numRows = 0;

        for (std::string line; std::getline(input, line); ) {
            auto tokens = tokenize(line);
            if (tokens.size() != numCols) csvError("Lines have varying number of entries.");

            /* The tokens aren't needed after this point, so move them rather than copy. */
            for (auto& token: tokens) {
                result.push_back(std::move(token));
            }
            numRows++;
        }

        return std::make_tuple(std::move(result), numRows);
    }
}

/* Builds a CSV from a stream: the first line supplies the column headers, and the
 * remaining lines supply the data, each of which must have one entry per header.
 */
inline CSV CSV::parse(std::istream& input) {
    namespace Impl = MiniData_CSVImpl;

    CSV parsed;
    parsed.mColumnHeaders = Impl::readHeaders(input);

    /* The header count fixes how many entries each body line must contain. */
    const std::size_t columnCount = parsed.mColumnHeaders.size();
    std::tie(parsed.mData, parsed.mRows) = Impl::readBody(input, columnCount);

    return parsed;
}

/* Opens the named file and parses its contents as CSV data. Reports an error
 * through csvError if the file stream cannot be opened.
 */
inline CSV CSV::parseFile(const std::string& filename) {
    std::ifstream input(filename);
    if (input.fail()) {
        MiniData_CSVImpl::csvError("Cannot open file " + filename);
    }

    return CSV::parse(input);
}

inline size_t CSV::numRows() const {
    return mRows;
}

inline size_t CSV::numCols() const {
    return mColumnHeaders.size();
}

/* Returns the column headers ordered by column index. The headers are stored as a
 * header -> index map, so each one is written into the slot its index names.
 */
inline std::vector<std::string> CSV::headers() const {
    std::vector<std::string> ordered;
    ordered.resize(mColumnHeaders.size());

    for (auto itr = mColumnHeaders.begin(); itr != mColumnHeaders.end(); ++itr) {
        ordered[itr->second] = itr->first;
    }

    return ordered;
}

/* Row accessor: checks the row index, then returns a RowRef tied to this CSV.
 * Reports an error through csvError if the index is not below numRows().
 */
inline CSV::RowRef CSV::operator[] (std::size_t row) const {
    if (!(row < numRows())) {
        MiniData_CSVImpl::csvError("Row out of range.");
    }

    return RowRef(this, row);
}

/* Records which CSV this proxy refers to and which of its rows it denotes. */
inline CSV::RowRef::RowRef(const CSV* parent, std::size_t row)
    : mParent(parent),
      mRow(row) {
    /* Nothing further to set up. */
}

/* Looks up an entry of this row by column index, returning a copy of it. Reports
 * an error through csvError if the index is not below the parent's column count.
 */
inline std::string CSV::RowRef::operator[] (std::size_t col) const {
    const std::size_t width = mParent->numCols();
    if (col >= width) {
        MiniData_CSVImpl::csvError("Column out of range.");
    }

    /* The grid is stored row-major, so this row begins at offset width * mRow. */
    const std::size_t cell = width * mRow + col;
    return mParent->mData[cell];
}
/* Looks up an entry of this row by column header. The header is translated to its
 * column index, and the lookup is then delegated to the index-based overload.
 * Reports an error through csvError if no column has the given header.
 */
inline std::string CSV::RowRef::operator[] (const std::string& colHeader) const {
    const auto& headerIndex = mParent->mColumnHeaders;

    const auto found = headerIndex.find(colHeader);
    if (found == headerIndex.end()) {
        MiniData_CSVImpl::csvError("Column not found: " + colHeader);
    }

    return this->operator[](found->second);
}

/* Passes the message straight through to the std::logic_error base class; there
 * is no additional state to initialize.
 */
inline CSVException::CSVException(const std::string& message)
    : std::logic_error(message) {}

#endif