Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions src/main/java/io/deephaven/csv/CsvSpecs.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import io.deephaven.csv.annotations.BuildableStyle;
import io.deephaven.csv.parsers.Parser;
import io.deephaven.csv.parsers.Parsers;
import io.deephaven.csv.parsers.StringParser;
import io.deephaven.csv.tokenization.JdkDoubleParser;
import io.deephaven.csv.tokenization.Tokenizer;
import io.deephaven.csv.tokenization.Tokenizer.CustomDoubleParser;
Expand All @@ -12,6 +13,7 @@
import org.immutables.value.Value.Immutable;
import org.jetbrains.annotations.Nullable;

import java.nio.charset.Charset;
import java.time.Duration;
import java.util.*;
import java.util.function.Function;
Expand Down Expand Up @@ -127,7 +129,8 @@ public interface Builder {
Builder putNullValueLiteralsForIndex(int index, List<String> nullValueLiteral);

/**
* The parser to use when all values in the column are null. The default is {@link Parsers#STRING}.
* The parser to use when all values in the column are null. The default is {@link StringParser#of(Charset)}
* with {@link CsvSpecs#charset()}.
*
* @param parser The parser
* @return self after modifying the parser property.
Expand Down Expand Up @@ -345,6 +348,14 @@ public interface Builder {
*/
Builder threadShutdownTimeout(Duration timeout);

/**
* The charset to use for library-built {@link StringParser} instances. By default, this is {@link Charset#defaultCharset()}.
*
* @param charset the charset
* @return self after modifying the charset property.
*/
Builder charset(Charset charset);

/**
* Build the CsvSpecs object.
*
Expand Down Expand Up @@ -458,13 +469,13 @@ public static CsvSpecs headerless() {
public abstract Map<Integer, String> headerForIndex();

/**
* See {@link Builder#parsers}.
* See {@link Builder#parsers}. Defaults to {@link Parsers#defaults(CsvSpecs)}.
*
* @return The set of configured parsers.
*/
@Default
public List<Parser<?>> parsers() {
return Parsers.DEFAULT;
return Parsers.defaults(this);
}

/**
Expand Down Expand Up @@ -511,9 +522,8 @@ public List<String> nullValueLiterals() {
* @return The parser to use when all values in the column are null.
*/
@Default
@Nullable
public Parser<?> nullParser() {
return Parsers.STRING;
return StringParser.of(charset());
}

/**
Expand Down Expand Up @@ -728,6 +738,16 @@ public Duration threadShutdownTimeout() {
return defaultThreadShutdownTimeout;
}

/**
* The charset to use for library-built {@link StringParser} instances. By default, this is {@link Charset#defaultCharset()}.
*
* @return the charset
*/
@Default
public Charset charset() {
return Charset.defaultCharset();
}

private static void check7BitAscii(String what, char c, List<String> problems) {
if (c > 0x7f) {
final String message = String.format("%s is set to '%c' but is required to be 7-bit ASCII",
Expand Down
19 changes: 18 additions & 1 deletion src/main/java/io/deephaven/csv/containers/ByteSlice.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import org.jetbrains.annotations.NotNull;

import java.nio.charset.Charset;

/**
* An object that represents a slice of byte data. This object is intended to be reusable. It implements
* {@link CharSequence} because there are a few special cases (e.g. calling out to certain libraries) where it is
Expand Down Expand Up @@ -155,10 +157,25 @@ public CharSequence subSequence(final int start, final int end) {
return new ByteSlice(data, newBegin, newEnd);
}

/**
* Creates the string from this byte slice using {@link Charset#defaultCharset()}.
*
* @return the String
*/
@Override
@NotNull
public String toString() {
return toString(Charset.defaultCharset());
}

/**
* Creates the string from this byte slice using {@code charset}.
*
* @param charset the charset
* @return the String
*/
public String toString(final Charset charset) {
final int size = size();
return size == 0 ? "" : new String(data, begin, size);
return size == 0 ? "" : new String(data, begin, size, charset);
}
}
10 changes: 10 additions & 0 deletions src/main/java/io/deephaven/csv/parsers/Parsers.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package io.deephaven.csv.parsers;

import io.deephaven.csv.CsvSpecs;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand Down Expand Up @@ -74,6 +76,14 @@ public class Parsers {
*/
public static final List<Parser<?>> DEFAULT = unmodifiable(BOOLEAN, INT, LONG, DOUBLE, DATETIME, CHAR, STRING);

/**
* Notably, BYTE, SHORT, and FLOAT are not in the list of standard parsers. The TIMESTAMP_* parsers are never
* included by default, because they look like ints/longs.
*/
public static List<Parser<?>> defaults(final CsvSpecs specs) {
return Arrays.asList(BOOLEAN, INT, LONG, DOUBLE, DATETIME, CHAR, StringParser.of(specs.charset()));
}

/**
* The above plus BYTE. The TIMESTAMP_* parsers are never included by default, because they look like ints/longs.
*/
Expand Down
25 changes: 21 additions & 4 deletions src/main/java/io/deephaven/csv/parsers/StringParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,31 @@
import io.deephaven.csv.util.CsvReaderException;
import org.jetbrains.annotations.NotNull;

import java.nio.charset.Charset;
import java.util.Objects;

/** The parser for the String type. */
public final class StringParser implements Parser<String[]> {
/**
* Singleton instance.
* Singleton instance with {@link Charset#defaultCharset()}.
*/
public static final StringParser INSTANCE = new StringParser();
public static final StringParser INSTANCE = new StringParser(Charset.defaultCharset());

private StringParser() {}
/**
* Returns a String parser that decodes values using the given {@code charset}.
*
* @param charset the charset
* @return the String parser
*/
public static StringParser of(final Charset charset) {
return Charset.defaultCharset().equals(charset) ? INSTANCE : new StringParser(charset);
}

private final Charset charset;

private StringParser(Charset charset) {
this.charset = Objects.requireNonNull(charset);
}

@NotNull
@Override
Expand Down Expand Up @@ -50,7 +67,7 @@ public long tryParse(
nulls[chunkIndex++] = true;
continue;
}
final String value = ih.bs().toString();
final String value = ih.bs().toString(charset);
if (value.equals(reservedValue)) {
// If a reserved value is defined, it must not be present in the input.
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public final class ParseDenseStorageToColumn {
/**
* @param colNum The column number being parsed. Some custom sinks use this for their own information.
* @param dsr A reader for the input.
* @param parsers The set of parsers to try. If null, then {@link Parsers#DEFAULT} will be used.
* @param parsers The set of parsers to try.
* @param specs The CsvSpecs which control how the column is interpreted.
* @param nullValueLiteralsToUse If a cell text is equal to any of the values in this array, the cell will be
* interpreted as the null value. Typically set to a one-element array containing the empty string.
Expand All @@ -37,7 +37,7 @@ public static Result doit(
final String[] nullValueLiteralsToUse,
final SinkFactory sinkFactory)
throws CsvReaderException {
Set<Parser<?>> parserSet = new HashSet<>(parsers != null ? parsers : Parsers.DEFAULT);
Set<Parser<?>> parserSet = new HashSet<>(parsers);

final Tokenizer tokenizer = new Tokenizer(specs.customDoubleParser(), specs.customTimeZoneParser());
final Parser.GlobalContext gctx =
Expand Down Expand Up @@ -98,7 +98,7 @@ public static Result doit(

// The rest of this logic is for case 2: there is a non-null cell (so the type inference process can begin).

final CategorizedParsers cats = CategorizedParsers.create(parserSet);
final CategorizedParsers cats = CategorizedParsers.create(specs, parserSet);

if (cats.customParser != null) {
ih.reset();
Expand All @@ -112,15 +112,16 @@ public static Result doit(
return parseNumerics(cats, gctx, ih.move(), ihAlt.move());
}

List<Parser<?>> universeByPrecedence = Arrays.asList(Parsers.CHAR, Parsers.STRING);
final StringParser stringParser = StringParser.of(specs.charset());
List<Parser<?>> universeByPrecedence = Arrays.asList(Parsers.CHAR, stringParser);
final MutableBoolean dummyBoolean = new MutableBoolean();
final MutableLong dummyLong = new MutableLong();
if (cats.timestampParser != null && tokenizer.tryParseLong(ih.get().bs(), dummyLong)) {
universeByPrecedence = Arrays.asList(cats.timestampParser, Parsers.CHAR, Parsers.STRING);
universeByPrecedence = Arrays.asList(cats.timestampParser, Parsers.CHAR, stringParser);
} else if (cats.booleanParser != null && tokenizer.tryParseBoolean(ih.get().bs(), dummyBoolean)) {
universeByPrecedence = Arrays.asList(Parsers.BOOLEAN, Parsers.STRING);
universeByPrecedence = Arrays.asList(Parsers.BOOLEAN, stringParser);
} else if (cats.dateTimeParser != null && tokenizer.tryParseDateTime(ih.get().bs(), dummyLong)) {
universeByPrecedence = Arrays.asList(Parsers.DATETIME, Parsers.STRING);
universeByPrecedence = Arrays.asList(Parsers.DATETIME, stringParser);
}
List<Parser<?>> parsersToUse = limitToSpecified(universeByPrecedence, parserSet);
return parseFromList(parsersToUse, gctx, ih.move(), ihAlt.move());
Expand Down Expand Up @@ -391,7 +392,7 @@ public Failure(Moveable<IteratorHolder> ih, Moveable<IteratorHolder> ihAlt) {
}

private static class CategorizedParsers {
public static CategorizedParsers create(final Collection<Parser<?>> parsers)
public static CategorizedParsers create(final CsvSpecs specs, final Collection<Parser<?>> parsers)
throws CsvReaderException {
Parser<?> booleanParser = null;
final Set<Parser<?>> specifiedNumericParsers = new HashSet<>();
Expand Down Expand Up @@ -421,7 +422,7 @@ public static CategorizedParsers create(final Collection<Parser<?>> parsers)
continue;
}

if (p == Parsers.CHAR || p == Parsers.STRING) {
if (p == Parsers.CHAR || p instanceof StringParser) {
specifiedCharAndStringParsers.add(p);
continue;
}
Expand Down Expand Up @@ -472,7 +473,7 @@ public static CategorizedParsers create(final Collection<Parser<?>> parsers)
Parsers.FLOAT_STRICT,
Parsers.DOUBLE);
final List<Parser<?>> allCharAndStringParsersByPrecedence =
Arrays.asList(Parsers.CHAR, Parsers.STRING);
Arrays.asList(Parsers.CHAR, StringParser.of(specs.charset()));

final List<Parser<?>> numericParsers =
limitToSpecified(allNumericParsersByPrecedence, specifiedNumericParsers);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public static String[] determineHeadersToUse(final CsvSpecs specs,
}
--skipCount;
}
headersToUse = Arrays.stream(headerRow).map(String::new).toArray(String[]::new);
headersToUse = Arrays.stream(headerRow).map(x -> new String(x, specs.charset())).toArray(String[]::new);
}

// Whether or not the input had headers, maybe override with client-specified headers.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import io.deephaven.csv.util.MutableInt;
import io.deephaven.csv.util.MutableObject;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -59,7 +60,7 @@ public static String[] determineHeadersToUse(
}

headersToUse =
extractHeaders(headerRow, columnWidthsToUse, specs.useUtf32CountingConvention());
extractHeaders(headerRow, columnWidthsToUse, specs.useUtf32CountingConvention(), specs.charset());
} else {
if (columnWidthsToUse.length == 0) {
throw new CsvReaderException(
Expand Down Expand Up @@ -135,7 +136,8 @@ private static int[] inferColumnWidths(ByteSlice row, boolean useUtf32CountingCo
* @param utf32CountingMode Whether we are in the UTF-32 or UTF-16 counting mode
* @return The array of headers
*/
private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolean utf32CountingMode) {
private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolean utf32CountingMode,
Charset charset) {
final int numCols = columnWidths.length;
if (numCols == 0) {
return new String[0];
Expand All @@ -153,7 +155,7 @@ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolea
final int actualEndByte = Math.min(proposedEndByte, row.end());
tempSlice.reset(row.data(), beginByte, actualEndByte);
ReaderUtil.trimSpacesAndTabs(tempSlice);
result[colNum] = tempSlice.toString();
result[colNum] = tempSlice.toString(charset);
beginByte = actualEndByte;
}
return result;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import io.deephaven.csv.containers.ByteSlice;
import io.deephaven.csv.tokenization.Tokenizer.CustomDoubleParser;

import java.nio.charset.StandardCharsets;

/**
* A {@link CustomDoubleParser} that uses {@link Double#parseDouble(String)}. Not actually an 'enum'. We use this as a
* Java trick to get singletons.
Expand All @@ -21,7 +23,7 @@ public enum JdkDoubleParser implements CustomDoubleParser {
*/
@Override
public double parse(ByteSlice bs) throws NumberFormatException {
return Double.parseDouble(bs.toString());
return Double.parseDouble(bs.toString(StandardCharsets.US_ASCII));
}

/**
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/io/deephaven/csv/tokenization/Tokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io.deephaven.csv.reading.ReaderUtil;
import io.deephaven.csv.util.*;

import java.nio.charset.StandardCharsets;
import java.time.*;
import java.util.Iterator;
import java.util.Objects;
Expand Down Expand Up @@ -164,7 +165,7 @@ public boolean tryParseLong(final ByteSlice bs, final MutableLong result) {
*/
public boolean tryParseFloatStrict(final ByteSlice bs, final MutableFloat result) {
try {
final float res = Float.parseFloat(bs.toString());
final float res = Float.parseFloat(bs.toString(StandardCharsets.US_ASCII));
result.setValue(res);
return true;
} catch (NumberFormatException nfe) {
Expand Down
Loading
Loading