Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions src/main/java/io/deephaven/csv/CsvSpecs.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import io.deephaven.csv.annotations.BuildableStyle;
import io.deephaven.csv.parsers.Parser;
import io.deephaven.csv.parsers.Parsers;
import io.deephaven.csv.parsers.StringParser;
import io.deephaven.csv.tokenization.JdkDoubleParser;
import io.deephaven.csv.tokenization.Tokenizer;
import io.deephaven.csv.tokenization.Tokenizer.CustomDoubleParser;
Expand All @@ -12,6 +13,7 @@
import org.immutables.value.Value.Immutable;
import org.jetbrains.annotations.Nullable;

import java.nio.charset.Charset;
import java.time.Duration;
import java.util.*;
import java.util.function.Function;
Expand Down Expand Up @@ -127,7 +129,8 @@ public interface Builder {
Builder putNullValueLiteralsForIndex(int index, List<String> nullValueLiteral);

/**
* The parser to use when all values in the column are null. The default is {@link Parsers#STRING}.
* The parser to use when all values in the column are null. The default is {@link StringParser#of(Charset)}
* with {@link CsvSpecs#charset()}.
*
* @param parser The parser
* @return self after modifying the parser property.
Expand Down Expand Up @@ -345,6 +348,14 @@ public interface Builder {
*/
Builder threadShutdownTimeout(Duration timeout);

/**
* The charset to use for library-built {@link StringParser} instances. By default, this is {@link Charset#defaultCharset()}.
*
* @param charset the charset
* @return self after modifying the charset property.
*/
Builder charset(Charset charset);

/**
* Build the CsvSpecs object.
*
Expand Down Expand Up @@ -458,13 +469,13 @@ public static CsvSpecs headerless() {
public abstract Map<Integer, String> headerForIndex();

/**
* See {@link Builder#parsers}.
* See {@link Builder#parsers}. Defaults to {@link Parsers#defaults(CsvSpecs)}.
*
* @return The set of configured parsers.
*/
@Default
public List<Parser<?>> parsers() {
return Parsers.DEFAULT;
return Parsers.defaults(this);
}

/**
Expand Down Expand Up @@ -511,9 +522,8 @@ public List<String> nullValueLiterals() {
* @return The parser to use when all values in the column are null.
*/
@Default
@Nullable
public Parser<?> nullParser() {
return Parsers.STRING;
return StringParser.of(charset());
}

/**
Expand Down Expand Up @@ -728,6 +738,16 @@ public Duration threadShutdownTimeout() {
return defaultThreadShutdownTimeout;
}

/**
* The charset to use for library-built {@link StringParser} instances. By default, this is {@link Charset#defaultCharset()}.
*
* @return the charset
*/
@Default
public Charset charset() {
return Charset.defaultCharset();
}

private static void check7BitAscii(String what, char c, List<String> problems) {
if (c > 0x7f) {
final String message = String.format("%s is set to '%c' but is required to be 7-bit ASCII",
Expand Down
19 changes: 18 additions & 1 deletion src/main/java/io/deephaven/csv/containers/ByteSlice.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import org.jetbrains.annotations.NotNull;

import java.nio.charset.Charset;

/**
* An object that represents a slice of byte data. This object is intended to be reusable. It implements
* {@link CharSequence} because there are a few special cases (e.g. calling out to certain libraries) where it is
Expand Down Expand Up @@ -155,10 +157,25 @@ public CharSequence subSequence(final int start, final int end) {
return new ByteSlice(data, newBegin, newEnd);
}

/**
* Creates the string from this byte slice using {@link Charset#defaultCharset()}.
*
* @return the String
*/
@Override
@NotNull
public String toString() {
return toString(Charset.defaultCharset());
}

/**
* Creates the string from this byte slice using {@code charset}.
*
* @param charset the charset
* @return the String
*/
public String toString(final Charset charset) {
final int size = size();
return size == 0 ? "" : new String(data, begin, size);
return size == 0 ? "" : new String(data, begin, size, charset);
}
}
10 changes: 10 additions & 0 deletions src/main/java/io/deephaven/csv/parsers/Parsers.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package io.deephaven.csv.parsers;

import io.deephaven.csv.CsvSpecs;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand Down Expand Up @@ -74,6 +76,14 @@ public class Parsers {
*/
public static final List<Parser<?>> DEFAULT = unmodifiable(BOOLEAN, INT, LONG, DOUBLE, DATETIME, CHAR, STRING);

/**
* Notably, BYTE, SHORT, and FLOAT are not in the list of standard parsers. The TIMESTAMP_* parsers are never
* included by default, because they look like ints/longs.
*/
public static List<Parser<?>> defaults(final CsvSpecs specs) {
return Arrays.asList(BOOLEAN, INT, LONG, DOUBLE, DATETIME, CHAR, StringParser.of(specs.charset()));
}

/**
* The above plus BYTE. The TIMESTAMP_* parsers are never included by default, because they look like ints/longs.
*/
Expand Down
25 changes: 21 additions & 4 deletions src/main/java/io/deephaven/csv/parsers/StringParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,31 @@
import io.deephaven.csv.util.CsvReaderException;
import org.jetbrains.annotations.NotNull;

import java.nio.charset.Charset;
import java.util.Objects;

/** The parser for the String type. */
public final class StringParser implements Parser<String[]> {
/**
* Singleton instance.
* Singleton instance with {@link Charset#defaultCharset()}.
*/
public static final StringParser INSTANCE = new StringParser();
public static final StringParser INSTANCE = new StringParser(Charset.defaultCharset());

private StringParser() {}
/**
* Returns a String parser that decodes values using the given {@code charset}.
*
* @param charset the charset
* @return the String parser
*/
public static StringParser of(final Charset charset) {
return Charset.defaultCharset().equals(charset) ? INSTANCE : new StringParser(charset);
}

private final Charset charset;

private StringParser(Charset charset) {
this.charset = Objects.requireNonNull(charset);
}

@NotNull
@Override
Expand Down Expand Up @@ -50,7 +67,7 @@ public long tryParse(
nulls[chunkIndex++] = true;
continue;
}
final String value = ih.bs().toString();
final String value = ih.bs().toString(charset);
if (value.equals(reservedValue)) {
// If a reserved value is defined, it must not be present in the input.
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public final class ParseDenseStorageToColumn {
/**
* @param colNum The column number being parsed. Some custom sinks use this for their own information.
* @param dsr A reader for the input.
* @param parsers The set of parsers to try. If null, then {@link Parsers#DEFAULT} will be used.
* @param parsers The set of parsers to try.
* @param specs The CsvSpecs which control how the column is interpreted.
* @param nullValueLiteralsToUse If a cell text is equal to any of the values in this array, the cell will be
* interpreted as the null value. Typically set to a one-element array containing the empty string.
Expand All @@ -37,7 +37,7 @@ public static Result doit(
final String[] nullValueLiteralsToUse,
final SinkFactory sinkFactory)
throws CsvReaderException {
Set<Parser<?>> parserSet = new HashSet<>(parsers != null ? parsers : Parsers.DEFAULT);
Set<Parser<?>> parserSet = new HashSet<>(parsers);

final Tokenizer tokenizer = new Tokenizer(specs.customDoubleParser(), specs.customTimeZoneParser());
final Parser.GlobalContext gctx =
Expand Down Expand Up @@ -98,7 +98,7 @@ public static Result doit(

// The rest of this logic is for case 2: there is a non-null cell (so the type inference process can begin).

final CategorizedParsers cats = CategorizedParsers.create(parserSet);
final CategorizedParsers cats = CategorizedParsers.create(specs, parserSet);

if (cats.customParser != null) {
ih.reset();
Expand All @@ -112,15 +112,16 @@ public static Result doit(
return parseNumerics(cats, gctx, ih.move(), ihAlt.move());
}

List<Parser<?>> universeByPrecedence = Arrays.asList(Parsers.CHAR, Parsers.STRING);
final StringParser stringParser = StringParser.of(specs.charset());
List<Parser<?>> universeByPrecedence = Arrays.asList(Parsers.CHAR, stringParser);
final MutableBoolean dummyBoolean = new MutableBoolean();
final MutableLong dummyLong = new MutableLong();
if (cats.timestampParser != null && tokenizer.tryParseLong(ih.get().bs(), dummyLong)) {
universeByPrecedence = Arrays.asList(cats.timestampParser, Parsers.CHAR, Parsers.STRING);
universeByPrecedence = Arrays.asList(cats.timestampParser, Parsers.CHAR, stringParser);
} else if (cats.booleanParser != null && tokenizer.tryParseBoolean(ih.get().bs(), dummyBoolean)) {
universeByPrecedence = Arrays.asList(Parsers.BOOLEAN, Parsers.STRING);
universeByPrecedence = Arrays.asList(Parsers.BOOLEAN, stringParser);
} else if (cats.dateTimeParser != null && tokenizer.tryParseDateTime(ih.get().bs(), dummyLong)) {
universeByPrecedence = Arrays.asList(Parsers.DATETIME, Parsers.STRING);
universeByPrecedence = Arrays.asList(Parsers.DATETIME, stringParser);
}
List<Parser<?>> parsersToUse = limitToSpecified(universeByPrecedence, parserSet);
return parseFromList(parsersToUse, gctx, ih.move(), ihAlt.move());
Expand Down Expand Up @@ -391,7 +392,7 @@ public Failure(Moveable<IteratorHolder> ih, Moveable<IteratorHolder> ihAlt) {
}

private static class CategorizedParsers {
public static CategorizedParsers create(final Collection<Parser<?>> parsers)
public static CategorizedParsers create(final CsvSpecs specs, final Collection<Parser<?>> parsers)
throws CsvReaderException {
Parser<?> booleanParser = null;
final Set<Parser<?>> specifiedNumericParsers = new HashSet<>();
Expand Down Expand Up @@ -421,7 +422,7 @@ public static CategorizedParsers create(final Collection<Parser<?>> parsers)
continue;
}

if (p == Parsers.CHAR || p == Parsers.STRING) {
if (p == Parsers.CHAR || p instanceof StringParser) {
specifiedCharAndStringParsers.add(p);
continue;
}
Expand Down Expand Up @@ -472,7 +473,7 @@ public static CategorizedParsers create(final Collection<Parser<?>> parsers)
Parsers.FLOAT_STRICT,
Parsers.DOUBLE);
final List<Parser<?>> allCharAndStringParsersByPrecedence =
Arrays.asList(Parsers.CHAR, Parsers.STRING);
Arrays.asList(Parsers.CHAR, StringParser.of(specs.charset()));

final List<Parser<?>> numericParsers =
limitToSpecified(allNumericParsersByPrecedence, specifiedNumericParsers);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public static String[] determineHeadersToUse(final CsvSpecs specs,
}
--skipCount;
}
headersToUse = Arrays.stream(headerRow).map(String::new).toArray(String[]::new);
headersToUse = Arrays.stream(headerRow).map(x -> new String(x, specs.charset())).toArray(String[]::new);
}

// Whether or not the input had headers, maybe override with client-specified headers.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import io.deephaven.csv.util.MutableInt;
import io.deephaven.csv.util.MutableObject;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -59,7 +60,7 @@ public static String[] determineHeadersToUse(
}

headersToUse =
extractHeaders(headerRow, columnWidthsToUse, specs.useUtf32CountingConvention());
extractHeaders(headerRow, columnWidthsToUse, specs.useUtf32CountingConvention(), specs.charset());
} else {
if (columnWidthsToUse.length == 0) {
throw new CsvReaderException(
Expand Down Expand Up @@ -135,7 +136,8 @@ private static int[] inferColumnWidths(ByteSlice row, boolean useUtf32CountingCo
* @param utf32CountingMode Whether we are in the UTF-32 or UTF-16 counting mode
* @return The array of headers
*/
private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolean utf32CountingMode) {
private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolean utf32CountingMode,
Charset charset) {
final int numCols = columnWidths.length;
if (numCols == 0) {
return new String[0];
Expand All @@ -153,7 +155,7 @@ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolea
final int actualEndByte = Math.min(proposedEndByte, row.end());
tempSlice.reset(row.data(), beginByte, actualEndByte);
ReaderUtil.trimSpacesAndTabs(tempSlice);
result[colNum] = tempSlice.toString();
result[colNum] = tempSlice.toString(charset);
beginByte = actualEndByte;
}
return result;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import io.deephaven.csv.containers.ByteSlice;
import io.deephaven.csv.tokenization.Tokenizer.CustomDoubleParser;

import java.nio.charset.StandardCharsets;

/**
* A {@link CustomDoubleParser} that uses {@link Double#parseDouble(String)}. Not actually an 'enum'. We use this as a
* Java trick to get singletons.
Expand All @@ -21,7 +23,7 @@ public enum JdkDoubleParser implements CustomDoubleParser {
*/
@Override
public double parse(ByteSlice bs) throws NumberFormatException {
return Double.parseDouble(bs.toString());
return Double.parseDouble(bs.toString(StandardCharsets.US_ASCII));
}

/**
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/io/deephaven/csv/tokenization/Tokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io.deephaven.csv.reading.ReaderUtil;
import io.deephaven.csv.util.*;

import java.nio.charset.StandardCharsets;
import java.time.*;
import java.util.Iterator;
import java.util.Objects;
Expand Down Expand Up @@ -164,7 +165,7 @@ public boolean tryParseLong(final ByteSlice bs, final MutableLong result) {
*/
public boolean tryParseFloatStrict(final ByteSlice bs, final MutableFloat result) {
try {
final float res = Float.parseFloat(bs.toString());
final float res = Float.parseFloat(bs.toString(StandardCharsets.US_ASCII));
result.setValue(res);
return true;
} catch (NumberFormatException nfe) {
Expand Down
Loading
Loading