Project1/runner.sh at master · cmucloudcourse/Project1 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
#!/bin/bash

################################################################################
##                       Cloud Computing Course                               ##
##                    Runner Script for Project 1                             ##
##                                                                            ##
##            Copyright 2018-2019: Cloud Computing Course                     ##
##                     Carnegie Mellon University                             ##
##   Unauthorized distribution of copyrighted material, including             ##
##  unauthorized peer-to-peer file sharing, may subject the students          ##
##  to the fullest extent of Carnegie Mellon University copyright policies.   ##
################################################################################

################################################################################
##                      README Before You Start                               ##
################################################################################
# Fill in the functions below for each question.
# You may use any programming language(s) in any question.
# You may use other files or scripts in these functions as long as they are in
# the submission folder.
# All files MUST include source code (e.g. do not just submit jar or pyc files).
#
# We will suggest tools or libraries in each question to enrich your learning.
# You are allowed to solve questions without the recommended tools or libraries.
#
# The colon `:` is a POSIX built-in basically equivalent to the `true` command,
# REPLACE it with your own command in each function.
# Before you fill your solution,
# DO NOT remove the colon or the function will break because the bash functions
# may not be empty!


################################################################################
##                          Setup & Cleanup                                   ##
################################################################################

setup() {
  # Fill in this helper function to do any setup if you need to.
  #
  # This function will be executed once at the beginning of the grading process.
  # Other functions may be executed repeatedly in arbitrary order.
  # Make use of this function to reduce unnecessary overhead and to make your
  # code behave consistently.
  #
  # e.g. You should compile Java code in this function.
  #
  # However, DO NOT use this function to solve any question or
  # generate any intermediate output.
  #
  # Examples:
  # mvn clean package
  # pip3 install -r requirements.txt
  #
  # Standard output format:
  # No standard output needed
  :
}

cleanup() {
  # Fill this helper function to clean up if you need to.
  #
  # This function will be executed once at the end of the grading process
  # Other functions might be executed repeatedly in arbitrary order.
  #
  # Examples:
  # mvn clean
  #
  # Standard output format:
  # No standard output needed
  :
}

################################################################################
##                        Data Pre-processing                                 ##
################################################################################

filter() {
  # (45 points)
  #
  # Question:
  # Fill this function to filter the input stream read from the dataset `pageviews-20161109-000000.gz`
  # and redirect the output stream to a file called `output`.
  # You need to read the gz file directly as the input stream, instead of
  # uncompressing and then reading it.
  # You need to redirect the standard output stream to a file named `output`,
  # instead of using File I/O.
  #
  # One possible approach is to use `zcat` to uncompress the gz file to StdOut
  # and pipe it to your data filter program.
  #
  # Examples:
  # zcat pageviews-20161109-000000.gz | python data_filter.py
  # zcat pageviews-20161109-000000.gz | java -cp project1.jar edu.cmu.scs.cc.project1.DataFilter
  #
  # Hint for Java users:
  # DO NOT compile java code in this function,
  # you should put the `mvn` or `javac` command in the setup() function
  #
  # Standard output format:
  # No standard output needed
  #
  # Note: the program must be encoding aware
  :
}

################################################################################
##                              MapReduce                                     ##
################################################################################

# Hint:
# As we demonstrated in the video "Writing a Streaming Program in Java for
# Hadoop Streaming" and the section "Test locally with a Small Dataset", your
# mapper and reducer should execute correctly with the following piping process.
#
# # map phase
# for f in $*; do
#   export mapreduce_map_input_file=$f
#   zcat $f | ./runner.sh -r map >> mapout.$$
# done
# # reduce phase
# LC_ALL=C sort -k1,1i mapout.$$ | ./runner.sh -r reduce >> mapred_output
#
# Fill the following functions to your mapper and reducer which accept input
# from StdIn and provide output through Stdout.

map() {
  # Fill this function to run your mapper.
  #
  # Examples:
  # python mapper.py
  # java -cp project1.jar edu.cmu.scs.cc.project1.Mapper
  #
  # Hint for Java users:
  # DO NOT compile java code in this function, you should put the `javac` or
  # `mvn` command in the setup() function instead
  #
  # Standard input format:
  # `domain_code page_title count_views total_response_size`
  # Standard output format:
  # Tab-separated key-value pairs based on your design
  #
  # Note: the program must be encoding aware
  :
}

reduce() {
  # Fill this function to run your reducer.
  #
  # Examples:
  # python reducer.py
  # java -cp project1.jar edu.cmu.scs.cc.project1.Reducer
  #
  # Hint for Java users:
  # DO NOT compile java code in this function, you should put the `javac` or
  # `mvn` command in the setup() function instead
  #
  # Standard input format:
  # Tab-separated key-value pairs generated by your mapper
  # Standard output format:
  # monthly_view  page_title daily_view ... daily_view (i.e. in TSV format)
  #
  # Note: the program must be encoding aware
  :
}

################################################################################
##                           Data Analysis                                    ##
################################################################################
# In the following sections, you should write a function to compute the answer
# and print it to StdOut for each question.
#
# Finish the related primers first if you have not done so, as they will help
# you to get well prepared for the questions.
#
# WARNING:
# Treat every function as an independent and standalone program.
# Any question below should not depend on the result or the intermediate
# output from another.

q1() {
  # (5 points) read a TSV file to a DataFrame and calculate descriptive
  # statistics
  #
  # Question:
  # Print column-wise descriptive statistics of the output file.
  #
  # 1. the statistics metrics required are as follows (in a tabular view):
  #       monthly_view  daily_view  daily_view ...  daily_view
  # count     ...           ...         ...             ...
  # mean      ...           ...         ...             ...
  # std       ...           ...         ...             ...
  # min       ...           ...         ...             ...
  # 25%       ...           ...         ...             ...
  # 50%       ...           ...         ...             ...
  # 75%       ...           ...         ...             ...
  # max       ...           ...         ...             ...
  # 2. the statistics result should be in CSV format
  # 3. format each value to 2 decimal places
  #
  # A DataFrame represents a tabular, spreadsheet-like data structure containing
  # an ordered collection of columns.
  # As you notice, there is an intuitive relation between DataFrame and TSV
  # files.
  #
  # Use the pandas lib to read a TSV file and generate descriptive statistics
  # without having to reinvent the wheel.
  #
  # Hints:
  # 1. pandas.read_table can read a TSV file and return a DataFrame
  # 2. be careful that you should not read the first record as the header
  # 3. figure out the method of pandas.DataFrame which can calculate
  # descriptive statistics
  # 4. figure out the parameter of pandas.DataFrame.to_csv to format the
  # floating point numbers.
  #
  # You can solve the problem by filling in the correct method or parameters
  # to the following Python code:
  #
  # <import libs on your own>
  # df = pandas.read_table('output', index_col=1, encoding='utf-8', <params>)
  # df.<method>.to_csv(sys.stdout, encoding='utf-8', <params>)
  #
  # `index_col=1` means the second column of the input file, i.e. `page_title`
  # will be read as indexes.
  #
  # Standard output format:
  # count,6130.00,6130.00,...,6130.00
  # mean,235884.75,6356.04,...,7256.01
  # std,433129.27,13291.35,...,19867.40
  # min,100005.00,0.00,...,22.00
  # 25%,118625.25,3089.00,...,3395.25
  # 50%,151206.50,4221.00,...,4616.50
  # 75%,228864.00,6332.75,...,6835.00
  # max,17838479.00,438908.00,...,1169461.00
  :
}

q2() {
  # (Honors task) introduction to Series
  #
  # Question:
  # Print the 95th percentile of daily pageviews on Nov 9.
  #
  # The other basic Data Structure in pandas is Series.
  #
  # 1. a DataFrame can be thought of as a dictionary of Series:
  # each column is a Series and all the Series share the same index.
  # 2. Series is a one-dimensional array-like object containing an array of data
  # and an associated array of data labels, called its index.
  #
  # The workflow to solve this question can be as follows:
  # 1. read the file as a DataFrame
  # 2. select the column of Nov 9
  # 3. calculate the 95th percentile
  #
  # In this question, you will explore one of the approaches to get a column
  # from a DataFrame: integer position based selection (iloc).
  #
  # Read the "Selection By Position" section at:
  # https://pandas.pydata.org/pandas-docs/stable/indexing.html
  #
  # You can solve the problem by filling in the correct positions or parameters
  # to the following Python code:
  #
  # <import libs on your own>
  # df = pandas.read_table('output', index_col=1, encoding='utf-8', <params>)
  # print("%.2f" % df.iloc[<positions>].<method>)
  #
  # Standard output format:
  # <a decimal value with 2 decimal places>
  :
}

q3() {
  # (Honors task) the essence of a Series is a dictionary
  #
  # Question:
  # Print the first 5 records in the output file along with monthly pageviews
  # as a JSON object.
  # The keys are the articles and the values are the pageviews.
  #
  # A series can be thought of a dictionary/map in essence.
  # A dictionary/map can be represented as a JSON object, which is an unordered
  # set of name/value pairs.
  #
  # Hint:
  # 1. use pandas.DataFrame.iloc[pos] to select the monthly_view column as a
  # Series
  # 2. the naming of the method which converts a pandas.Series to JSON is
  # self-explanatory
  # 3. to select the first 5 records, you can choose one of the following
  # approaches:
  # 3.1. use a method of pandas.Series similar to GNU tool `head` which
  # returns the first n rows
  # 3.2. use pandas.Series.iloc[pos]
  #
  # Standard output format:
  # {"Donald_Trump":17838479,"AMGTV":10521834,"Proyecto_40":9321993,
  # "United_States_presidential_election,_2016":9016683,"Melania_Trump":6443631}
  :
}

q4() {
  # (Honors task) rank DataFrame and break ties
  #
  # Question:
  # `senators.txt` has a list of articles about United States senators and
  # senators-elect. Please rank the senators and senators-elect by the
  # monthly pageviews. Print the ranked result as a JSON object.
  #
  # The keys of the JSON object are the articles, and the values are the
  # rankings (not pageviews).
  #
  # If the name of a senator or senator-elect does not exist in `output`, do
  # not include it in the ranking.
  #
  # Use "dense ranking" to deal with ties, a.k.a "1223" ranking.
  # If A ranks ahead of B and C (which compare equal) and both ranked ahead of
  # D, then A gets ranking number 1 ("first"), B gets ranking number 2
  # ("joint second"), C also gets ranking number 2 ("joint second") and D gets
  # ranking number 3 ("third").
  # Reference: https://en.wikipedia.org/wiki/Ranking
  #
  # There is no need to (re)implement your own ranking algorithm.
  # pandas.DataFrame.rank enables users to choose from different ways to
  # assign a rank to tied elements.
  #
  # One possible solution is as follows:
  #
  # <import libs on your own>
  # monthly_view = pandas.read_table('output', index_col=1, encoding='utf-8', \
  # <params>).iloc[<positions>]
  # pandas.read_table('senators.txt', index_col=0, encoding='utf-8').join( \
  # monthly_view, how='inner')
  # print(monthly_view.rank(<params>).<methods>.to_json())
  #
  # Standard output format:
  # {"Barbara_Boxer":16,"Chuck_Schumer":10,...,"Rand_Paul":17,"Ted_Cruz":9}
  :
}

################################################################################
##                    DO NOT MODIFY ANYTHING BELOW                            ##
################################################################################
declare -ar questions=("filter" "q1" "q2" "q3" "q4")
declare -ar mapred=("map" "reduce")

readonly usage="This program is used to execute your solution.
Usage:
./runner.sh to run all the questions
./runner.sh -r <question_id> to run one single question
./runner.sh -s to run setup() function
./runner.sh -c to run cleanup() function
Example:
./runner.sh -r q1 to run q1"

contains() {
  local e
  for e in "${@:2}"; do [[ "$e" == "$1" ]] && return 0; done
  return 1
}

while getopts ":hr:sc" opt; do
  case ${opt} in
    h)
      echo "$usage" >&2
      exit
    ;;
    s)
      setup
      echo "setup() function executed" >&2
      exit
    ;;
    c)
      cleanup
      echo "cleanup() function executed" >&2
      exit
    ;;
    r)
      question=$OPTARG
      if contains "$question" "${questions[@]}"; then
        answer=$("$question")
        echo -n "$answer"
      else
        if contains "$question" "${mapred[@]}"; then
          "$question"
        else
          echo "Invalid question id" >&2
          echo "$usage" >&2
          exit 2
        fi
      fi
      exit
    ;;
    \?)
      echo "Invalid option: -$OPTARG"  >&2
      echo "$usage" >&2
      exit 2
    ;;
  esac
done

if [ -z "$1" ]; then
  setup 1>&2
  echo "setup() function executed" >&2
  echo "The answers generated by executing your solution are: " >&2

  for question in "${questions[@]}"; do
    echo "$question:"
    result="$("$question")"
    if [[ "$result" ]]; then
      echo "$result"
    else
      echo ""
    fi
  done
  cleanup 1>&2
  echo "cleanup() function executed" >&2
  echo "If you feel these values are correct please run:"  >&2
  echo "./submitter"  >&2
else
  echo "Invalid usage" >&2
  echo "$usage" >&2
  exit 2
fi