Unix-basement/ebird_basic_dataset_splitter.sh at master · PLJV/Unix-basement · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/bin/bash

#
# Basic Bash script that accepts an eBird Basic Dataset (EBD) .txt file as a
# command-line argument and then splits the file
# into 500,000-line CSV chunks. The output can then
# be processed piecewise in R.
#
# Author : Kyle Taylor (kyle.taylor@pljv.org) [2017]
#
# edits : Initial build 8/10/2017
#

# find our split files
#
# Everything below needs the input file named by $1; fail early with a usage
# message rather than letting split/head fall back to reading stdin.
if [[ $# -lt 1 || ! -r "$1" ]]; then
  echo "usage: $0 <ebd_file.txt>" >&2
  exit 1
fi

# Fill the global split_files array with the chunk files found in the CWD.
# Only regular files named the way split(1) names its default output ("x"
# followed by lowercase letters, e.g. xaa, xab) are accepted, so renamed
# *.csv outputs or scratch files such as xaa1 are never mistaken for chunks.
function find_split_files {
  local candidate
  split_files=()
  # An unmatched glob stays as the literal "x*", which the -f test rejects.
  for candidate in x*; do
    if [[ -f "$candidate" && "$candidate" =~ ^x[a-z]+$ ]]; then
      split_files+=("$candidate")
    fi
  done
}

find_split_files

# Chunks from an earlier run are reused only when more than one is present.
if (( ${#split_files[@]} > 1 )); then
  echo "-- using previous split files found in CWD"
else
  echo " -- generating split files"
  if ! split -l 500000 -- "$1"; then
    echo "error: could not split '$1'" >&2
    exit 1
  fi
  # Pick up the chunks split just wrote; without this the array stays empty
  # and nothing downstream has any files to work on.
  find_split_files
fi

# record the header line of the input file (the first split file begins with
# a copy of it, which is purged further down)
header=$(head -n1 -- "$1")

#######################################
# Report whether the first line of a file equals the recorded header and,
# optionally, strip that line from the file in place.
# Globals:
#   header (read) - the header line to compare against
# Arguments:
#   $1 - path of the file to inspect
#   $2 - optional; when "chop", a matching first line is removed from $1
# Outputs:
#   a progress message on stdout when chopping; errors on stderr
# Returns:
#   1 when the header IS present, 0 when it is not. This is the inverse of
#   the usual shell truth convention and is kept as-is for existing callers.
#   2 when $1 is not a regular file or the chop could not be completed.
#######################################
function header_is_present {
  local file=$1
  local mode=${2:-}
  local first_line tmp

  # An empty path would make head read stdin and block; reject it up front.
  if [[ -z "$file" || ! -f "$file" ]]; then
    echo "header_is_present: not a regular file: '$file'" >&2
    return 2
  fi

  first_line=$(head -n1 -- "$file")

  # Quote the right-hand side so the header is compared literally instead of
  # being interpreted as a glob pattern.
  if [[ "$first_line" != "$header" ]]; then
    return 0
  fi

  if [[ "$mode" == "chop" ]]; then
    echo " -- chopping header from first split file"
    # Scratch file lives next to the target so the final mv is a rename.
    if ! tmp=$(mktemp "${file}.XXXXXX"); then
      echo "header_is_present: cannot create a scratch file for '$file'" >&2
      return 2
    fi
    if ! { tail -n +2 -- "$file" > "$tmp" && mv -- "$tmp" "$file"; }; then
      rm -f -- "$tmp"
      echo "header_is_present: failed to chop header from '$file'" >&2
      return 2
    fi
  fi
  return 1
}

#######################################
# Prepend a header line to every split file and rename each one to <name>.csv.
# Arguments:
#   $1 - NAME of a bash array variable holding the split-file paths. Bash
#        cannot pass an array by value, so callers pass e.g. split_files
#        rather than "${split_files[@]}". Avoid array names that start with
#        a double underscore; they could clash with this function's locals.
#   $2 - header line written as the first line of every output file
# Outputs:
#   progress dots on stdout; problems are reported on stderr
# Returns:
#   0 when every listed file was converted, 1 otherwise
#######################################
function add_header_to_split_files {
  local __header_line=$2
  local __files_ref __file __status=0
  local -a __files

  if [[ ! "$1" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    echo "add_header_to_split_files: '$1' is not an array variable name" >&2
    return 1
  fi

  # Indirect expansion copies the caller's array into a local one.
  __files_ref="$1[@]"
  __files=("${!__files_ref}")

  echo -n " -- processing:"
  for __file in "${__files[@]}"; do
    if [[ ! -f "$__file" ]]; then
      echo "add_header_to_split_files: missing split file '$__file'" >&2
      __status=1
      continue
    fi
    # Write header + data straight into the .csv, then drop the headerless
    # chunk. printf with a quoted argument keeps the header's tabs intact.
    if { printf '%s\n' "$__header_line"; cat -- "$__file"; } > "${__file}.csv"; then
      rm -f -- "$__file"
      echo -n "."
    else
      echo "add_header_to_split_files: could not write '${__file}.csv'" >&2
      __status=1
    fi
  done
  # terminate the progress line
  echo
  return "$__status"
}

# process our chunks by adding a header to each split file

if (( ${#split_files[@]} > 0 )); then
  # the first split should have a lurking header at line n=1
  header_is_present "${split_files[0]}" "chop"
  # pass the array by name and quote the header so it arrives as one argument
  add_header_to_split_files split_files "$header"
else
  echo " -- no split files to process" >&2
fi

# compress everything for the user
#
# The archive is named after the input file with a trailing ".txt" removed.
archive="${1%.txt}_processed.zip"

# Collect the files that actually exist for each chunk: the renamed
# <chunk>.csv when present, otherwise the chunk under its original name.
archive_inputs=()
for split_file in "${split_files[@]}"; do
  if [[ -f "${split_file}.csv" ]]; then
    archive_inputs+=("${split_file}.csv")
  elif [[ -f "$split_file" ]]; then
    archive_inputs+=("$split_file")
  fi
done

if (( ${#archive_inputs[@]} == 0 )); then
  echo "error: no split files available to archive" >&2
  exit 1
fi

# Remove only a stale archive from a previous run; never the input file.
rm -f -- "$archive"

# Delete the chunks only once they are safely inside the archive.
if 7za a -- "$archive" "${archive_inputs[@]}"; then
  rm -f -- "${archive_inputs[@]}"
else
  echo "error: 7za failed; split files were left in place" >&2
  exit 1
fi