// qsv/src/cmd/foreach.rs (dathere/qsv, master branch)
// Usage/help text for the `foreach` command. `run` passes this string to
// `util::get_args` together with argv to populate `Args`, so everything inside
// the raw string is runtime data rather than documentation.
// NOTE(review): presumably the "Usage:" and option sections drive argument
// parsing (the `<column>`/`--dry-run` names mirror the `arg_*`/`flag_*` fields
// of `Args`) — confirm before rewording any of it.
static USAGE: &str = r#"
Execute a shell command once per record in a given CSV file.

NOTE: Windows users are recommended to use Git Bash as their terminal when
running this command. Download it from https://git-scm.com/downloads. When installing,
be sure to select "Use Git from the Windows Command Prompt" to ensure that the
necessary Unix tools are available in the terminal.

WARNING: This command can be dangerous. Be careful when using it with
untrusted input.

Or per @thadguidry: 😉
Please ensure when using foreach to use trusted arguments, variables, scripts, etc.
If you don't do due diligence and blindly use untrusted parts... foreach can indeed
become a footgun and possibly fry your computer, eat your lunch, and expose an entire
datacenter to a cancerous virus in your unvetted batch file you grabbed from some
stranger on the internet that runs...FOR EACH LINE in your CSV file. GASP!"

Examples:

Delete all files whose filenames are listed in the filename column:

  $ qsv foreach filename 'rm {}' assets.csv

Execute a command that outputs CSV once per record without repeating headers:

  $ qsv foreach query --unify 'search --year 2020 {}' queries.csv > results.csv

Same as above but with an additional column containing the current value:

  $ qsv foreach query -u -c from_query 'search {}' queries.csv > results.csv

For more examples, see https://github.com/dathere/qsv/blob/master/tests/test_foreach.rs.

If any child command exits with a non-zero status, foreach finishes processing
all rows but then exits with a non-zero status of its own.

Usage:
    qsv foreach [options] <column> <command> [<input>]
    qsv foreach --help

foreach arguments:
    column      The column whose value is substituted into the command.
                Only a single column is accepted.
    command     The command to execute. Use "{}" to substitute the value
                of the current input file line. The command must be
                non-empty after whitespace trimming.
                If you need to execute multiple commands, use a shell
                script. See foreach_multiple_commands_with_shell_script()
                in tests/test_foreach.rs for an example.
    input       The CSV file to read. If not provided, will read from stdin.

foreach options:
    -u, --unify                If the output of the executed command is a CSV,
                               unify the result by skipping headers on each
                               subsequent command. Does not work when --dry-run is true.
                               The first child's CSV header row becomes canonical;
                               later children are expected to produce the same schema.
    -c, --new-column <name>    If unifying, add a new column with given name
                               and copying the value of the current input file line.
    --dry-run <file|boolean>   If set to true (the default for safety reasons), the commands are
                               sent to stdout instead of executing them.
                               If set to a file, the commands will be written to the specified
                               text file instead of executing them. The file is only created
                               after all flag validation succeeds, so a conflicting flag
                               combination will not truncate an existing file.
                               Only if set to false will the commands be actually executed.
                               [default: true]

Common options:
    -h, --help             Display this message
    -n, --no-headers       When set, the file will be considered to have no
                           headers.
    -d, --delimiter <arg>  The field delimiter for reading CSV data.
                           Must be a single character. (default: ,)
    -p, --progressbar      Show progress bars. Not valid for stdin.
"#;

#[cfg(target_family = "windows")]
use std::ffi::OsString;
#[cfg(target_family = "unix")]
use std::{ffi::OsStr, os::unix::ffi::OsStrExt};
use std::{
    io::{self, BufReader, BufWriter, Write},
    process::{Command, Stdio},
};

#[cfg(feature = "feature_capable")]
use indicatif::{ProgressBar, ProgressDrawTarget};
use regex::bytes::{NoExpand, Regex};
use serde::Deserialize;

use crate::{
    CliResult,
    config::{Config, Delimiter},
    select::SelectColumns,
    util,
};

/// Command-line arguments for `foreach`, deserialized by `util::get_args`
/// from `USAGE` and argv.
#[derive(Deserialize)]
struct Args {
    /// Column selection; `run` rejects selections of more than one column.
    arg_column:       SelectColumns,
    /// Command template; every `{}` is replaced with the current row's value.
    arg_command:      String,
    /// Input CSV path, handed to `Config::new`.
    arg_input:        Option<String>,
    /// Parse each child's stdout as CSV and merge it into one output stream.
    flag_unify:       bool,
    /// Name of an extra output column holding the current input value;
    /// rejected by `run` unless `flag_unify` is set.
    flag_new_column:  Option<String>,
    /// "true" / "false" (ASCII case-insensitive); any other value is treated
    /// as the path of a file that receives the dry-run output.
    flag_dry_run:     String,
    /// Forwarded to the reader config via `no_headers_flag`.
    flag_no_headers:  bool,
    /// Delimiter for the input reader; also used when parsing child stdout in
    /// unify mode (falls back to `,` there when unset).
    flag_delimiter:   Option<Delimiter>,
    /// Request a progress bar (only honoured when input is not stdin).
    flag_progressbar: bool,
}

/// Returns `bytes` with one pair of surrounding quotes removed.
///
/// The pair is removed only when the slice has at least two bytes, starts with
/// `"`, `'` or a backtick, and ends with that very same byte. Anything else —
/// including a lone quote or mismatched quote characters — is handed back
/// untouched. Only the outermost pair is stripped; nested quotes are kept.
fn strip_outer_quotes(bytes: &[u8]) -> &[u8] {
    match bytes {
        // The slice pattern itself enforces the "two bytes or more" rule.
        [open @ (b'"' | b'\'' | b'`'), inner @ .., close] if open == close => inner,
        _ => bytes,
    }
}

/// Where the generated commands go, derived from the `--dry-run` flag value.
enum DryRun {
    /// Real execution: every generated command is spawned as a child process.
    Disabled,
    /// Dry run (the default): generated commands are printed to stdout.
    Stdout,
    /// Dry run: generated commands are written to the file at this path.
    File(String),
}

/// Entry point for `qsv foreach`: builds one command per input record and
/// either prints it (dry run) or executes it.
///
/// For every record, each `{}` in the command template is replaced with the
/// value of the selected column, the result is tokenised into a program and
/// its arguments, and then:
///   - in dry-run mode the command line is written to stdout or to the
///     dry-run file;
///   - otherwise the program is spawned, and with `--unify` its stdout is
///     parsed as CSV and merged into a single output stream.
///
/// # Errors
///
/// Returns an error when the command is blank, when `--unify` is combined
/// with a dry run, when `--new-column` is given without `--unify`, when the
/// selection does not resolve to exactly one column, when the dry-run file
/// cannot be created, or when reading input, spawning a child, or writing
/// output fails. After all rows are processed it also returns an error if any
/// child exited unsuccessfully or any row produced an empty command.
pub fn run(argv: &[&str]) -> CliResult<()> {
    let args: Args = util::get_args(USAGE, argv)?;

    if args.arg_command.trim().is_empty() {
        return fail_incorrectusage_clierror!("foreach: <command> cannot be empty");
    }

    let dry_run = match args.flag_dry_run.as_str() {
        s if s.eq_ignore_ascii_case("true") => DryRun::Stdout,
        s if s.eq_ignore_ascii_case("false") => DryRun::Disabled,
        file_str => DryRun::File(file_str.to_string()),
    };
    let is_dry_run = !matches!(dry_run, DryRun::Disabled);

    // Validate flag combinations BEFORE any side effects (file creation, etc.)
    // so a conflicting --dry-run=file --unify never truncates the user's file.
    if is_dry_run && args.flag_unify {
        return fail_incorrectusage_clierror!("Cannot use --unify with --dry-run");
    }
    if args.flag_new_column.is_some() && !args.flag_unify {
        return fail_incorrectusage_clierror!("Cannot use --new-column without --unify");
    }

    let rconfig = Config::new(args.arg_input.as_ref())
        .delimiter(args.flag_delimiter)
        .no_headers_flag(args.flag_no_headers)
        .select(args.arg_column);

    let mut rdr = rconfig.reader()?;
    let mut wtr = Config::new(None).writer()?;

    let headers = rdr.byte_headers()?.clone();
    let sel = rconfig.selection(&headers)?;
    if sel.len() > 1 {
        return fail_incorrectusage_clierror!(
            "foreach accepts a single column; got {} columns",
            sel.len()
        );
    }
    let Some(&column_index) = sel.iter().next() else {
        return fail_incorrectusage_clierror!("foreach: no input column selected");
    };

    // template_pattern matches `{}` substitution markers in the user's command.
    #[allow(clippy::trivial_regex)]
    let template_pattern = Regex::new(r"\{\}")?;

    // splitter_pattern tokenises the substituted command. It matches either:
    //   - a sequence of word-like characters (a-z, A-Z, 0-9, _, ., +, /, -), or
    //   - a double-quoted, single-quoted, or backtick-quoted string.
    // It does not handle escaped quotes — for anything fancier, users should
    // wrap the command in a shell script.
    let splitter_pattern = Regex::new(r#"(?:[a-zA-Z0-9_.+/-]+|"[^"]*"|'[^']*'|`[^`]*`)"#)?;

    // Open the dry-run sink only AFTER all flag validation has run, so a
    // user-supplied dry-run file is never truncated for a command that was
    // about to error out anyway.
    let mut dry_run_file: Box<dyn Write> = match &dry_run {
        DryRun::Stdout => Box::new(BufWriter::new(io::stdout())),
        DryRun::File(path) => match std::fs::File::create(path) {
            Ok(f) => Box::new(BufWriter::new(f)),
            Err(e) => {
                return fail_incorrectusage_clierror!("Error creating dry-run file '{path}': {e}");
            },
        },
        DryRun::Disabled => Box::new(io::sink()),
    };

    let mut record = csv::ByteRecord::new();
    let mut output_headers_written = false;

    // prep progress bar
    #[cfg(feature = "feature_capable")]
    let show_progress =
        (args.flag_progressbar || util::get_envvar_flag("QSV_PROGRESSBAR")) && !rconfig.is_stdin();
    #[cfg(feature = "feature_capable")]
    let progress = ProgressBar::with_draw_target(None, ProgressDrawTarget::stderr_with_hz(5));
    #[cfg(feature = "feature_capable")]
    if show_progress {
        util::prep_progress(&progress, util::count_rows(&rconfig)?);
    } else {
        progress.set_draw_target(ProgressDrawTarget::hidden());
    }

    let mut row_idx: u64 = 0;
    let mut any_child_failed = false;

    while rdr.read_byte_record(&mut record)? {
        row_idx += 1;
        #[cfg(feature = "feature_capable")]
        if show_progress {
            progress.inc(1);
        }
        let current_value = &record[column_index];

        // replace_all returns a Cow<[u8]> that lives only for this iteration —
        // no per-row allocation when there are no `{}` markers, and otherwise a
        // single owned buffer that's dropped at end of iteration.
        // NoExpand makes the replacement byte-for-byte literal — without it,
        // a CSV value containing `$1`, `$$`, etc. would be interpreted as a
        // capture-group reference and mangled.
        let templated_command =
            template_pattern.replace_all(args.arg_command.as_bytes(), NoExpand(current_value));

        let mut command_pieces = splitter_pattern.find_iter(&templated_command);

        let Some(prog_match) = command_pieces.next() else {
            // Empty post-substitution command — treat the same as a non-zero
            // child exit so we honour the "finish all rows, then exit non-zero"
            // contract instead of bailing mid-stream.
            eprintln!("foreach: row {row_idx} command is empty after substitution; skipping");
            any_child_failed = true;
            continue;
        };

        let prog_bytes = strip_outer_quotes(prog_match.as_bytes());
        #[cfg(target_family = "unix")]
        let prog = OsStr::from_bytes(prog_bytes);
        #[cfg(target_family = "windows")]
        let prog = match simdutf8::basic::from_utf8(prog_bytes) {
            Ok(s) => OsString::from(s),
            Err(_) => {
                return fail_clierror!("foreach: program path contains invalid UTF-8");
            },
        };

        let cmd_args: Vec<String> = command_pieces
            .map(|piece| {
                simdutf8::basic::from_utf8(strip_outer_quotes(piece.as_bytes()))
                    .unwrap_or_default()
                    .to_string()
            })
            .collect();

        if is_dry_run {
            #[cfg(target_family = "unix")]
            let prog_str = simdutf8::basic::from_utf8(prog.as_bytes()).unwrap_or_default();
            #[cfg(target_family = "windows")]
            let prog_str = simdutf8::basic::from_utf8(prog.as_encoded_bytes()).unwrap_or_default();
            let cmd_args_string = cmd_args.join(" ");
            dry_run_file.write_all(format!("{prog_str} {cmd_args_string}\n").as_bytes())?;
            continue;
        }

        let status = if args.flag_unify {
            let mut cmd = Command::new(prog)
                .args(cmd_args)
                .stdout(Stdio::piped())
                .stderr(Stdio::inherit())
                .spawn()?;

            {
                // stdout was configured as piped just above, so it is present.
                let stdout = cmd.stdout.as_mut().unwrap();
                let stdout_reader = BufReader::new(stdout);

                let mut stdout_rdr = csv::ReaderBuilder::new()
                    .delimiter(match &args.flag_delimiter {
                        Some(delimiter) => delimiter.as_byte(),
                        None => b',',
                    })
                    .has_headers(true)
                    .from_reader(stdout_reader);

                let mut output_record = csv::ByteRecord::new();

                if !output_headers_written {
                    // Headers from the first child command's CSV output become
                    // canonical for the unified stream — subsequent commands
                    // are expected to produce CSVs with the same schema.
                    let mut headers = stdout_rdr.byte_headers()?.clone();

                    // A child that printed nothing yields an empty header
                    // record. Writing that would lock in a bogus zero-field
                    // header (or one holding only the new column) and drop the
                    // real header of every later child, so keep waiting until
                    // a child actually produces a header row.
                    if !headers.is_empty() {
                        if let Some(name) = &args.flag_new_column {
                            headers.push_field(name.as_bytes());
                        }

                        wtr.write_byte_record(&headers)?;
                        output_headers_written = true;
                    }
                }

                while stdout_rdr.read_byte_record(&mut output_record)? {
                    if args.flag_new_column.is_some() {
                        output_record.push_field(current_value);
                    }

                    wtr.write_byte_record(&output_record)?;
                }
            }

            cmd.wait()?
        } else {
            let mut cmd = Command::new(prog)
                .args(cmd_args)
                .stdout(Stdio::inherit())
                .stderr(Stdio::inherit())
                .spawn()?;

            cmd.wait()?
        };

        if !status.success() {
            // `code()` is None when the child has no exit code to report;
            // that case is printed as "signal".
            eprintln!(
                "foreach: row {row_idx} command failed (exit {})",
                status
                    .code()
                    .map_or_else(|| "signal".to_string(), |c| c.to_string())
            );
            any_child_failed = true;
        }
    }
    #[cfg(feature = "feature_capable")]
    if show_progress {
        util::finish_progress(&progress);
    }
    dry_run_file.flush()?;
    wtr.flush()?;

    if any_child_failed {
        return fail_clierror!("foreach: one or more child commands exited with non-zero status");
    }
    Ok(())
}