cd /depot/statclass/data/dataexpo2009 1a. basic solution: cat 2005.csv | awk -F, '{arrdelay += $15; flightcount++} END{print arrdelay/flightcount}' advanced solution: cat 2005.csv | grep -v ArrDelay | awk -F, '{if ($15!="NA") {arrdelay += $15; flightcount++}} END{print arrdelay/flightcount}' basic solution: 7.03274; advanced solution: 7.18134 1b. basic solution: cat 2005.csv | awk -F, '{depdelay += $16; flightcount++} END{print depdelay/flightcount}' advanced solution: cat 2005.csv | grep -v DepDelay | awk -F, '{if ($16!="NA") {depdelay += $16; flightcount++}} END{print depdelay/flightcount}' basic solution: 8.51186; advanced solution: 8.67431 1c. basic solution: cat [1-2]*.csv | awk -F, '{arrdelay += $15; flightcount++} END{print arrdelay/flightcount}' advanced solution: cat [1-2]*.csv | grep -v ArrDelay | awk -F, '{if ($15!="NA") {arrdelay += $15; flightcount++}} END{print arrdelay/flightcount}' basic solution: 6.90229; advanced solution: 7.04996 1d. basic solution: cat [1-2]*.csv | awk -F, '{depdelay += $16; flightcount++} END{print depdelay/flightcount}' advanced solution: cat [1-2]*.csv | grep -v DepDelay | awk -F, '{if ($16!="NA") {depdelay += $16; flightcount++}} END{print depdelay/flightcount}' basic solution: 8.01844; advanced solution: 8.17071 2a. basic solution: cat 2005.csv | awk -F, '{if (($4 == 6) || ($4 == 7)) {arrdelay += $15; flightcount++}} END{print arrdelay/flightcount}' advanced solution: cat 2005.csv | grep -v ArrDelay | awk -F, '{if ((($4 == 6) || ($4 == 7)) && ($15!="NA")) {arrdelay += $15; flightcount++}} END{print arrdelay/flightcount}' basic solution: 4.84079; advanced solution: 4.93905 2b. basic solution: cat 2005.csv | awk -F, '{if (($4 == 6) || ($4 == 7)) {depdelay += $16; flightcount++}} END{print depdelay/flightcount}' advanced solution: cat 2005.csv | grep -v DepDelay | awk -F, '{if ((($4 == 6) || ($4 == 7)) && ($16!="NA")) {depdelay += $16; flightcount++}} END{print depdelay/flightcount}' basic solution: 7.51425; advanced solution: 7.65198 2c. basic solution: cat [1-2]*.csv | awk -F, '{if (($4 == 6) || ($4 == 7)) {arrdelay += $15; flightcount++}} END{print arrdelay/flightcount}' advanced solution: cat [1-2]*.csv | grep -v ArrDelay | awk -F, '{if ((($4 == 6) || ($4 == 7)) && ($15!="NA")) {arrdelay += $15; flightcount++}} END{print arrdelay/flightcount}' basic solution: 5.30331; advanced solution: 5.4005 2d. basic solution: cat [1-2]*.csv | awk -F, '{if (($4 == 6) || ($4 == 7)) {depdelay += $16; flightcount++}} END{print depdelay/flightcount}' advanced solution: cat [1-2]*.csv | grep -v DepDelay | awk -F, '{if ((($4 == 6) || ($4 == 7)) && ($16!="NA")) {depdelay += $16; flightcount++}} END{print depdelay/flightcount}' basic solution: 7.55609; advanced solution: 7.677 3a. basic solution: cat [1-2]*.csv | awk -F, '{if (($17 == "IND") && ($18 == "ORD")) {depdelay += $16; flightcount++}} END{print depdelay/flightcount}' advanced solution: cat [1-2]*.csv | grep -v DepDelay | awk -F, '{if ((($17 == "IND") && ($18 == "ORD")) && ($16!="NA")) {depdelay += $16; flightcount++}} END{print depdelay/flightcount}' basic solution: 8.85475; advanced solution: 9.13213 3b. basic solution: cat ~/INDtoORD.txt | awk -F, '{depdelay += $16; flightcount++} END{print depdelay/flightcount}' advanced solution: cat ~/INDtoORD.txt | grep -v DepDelay | awk -F, '{if ($16!="NA") {depdelay += $16; flightcount++}} END{print depdelay/flightcount}' cd /depot/statclass/data/taxi2018 4a. basic solution: cat yellow_tripdata_2017-06.csv | awk -F, '{distance += $5; taxicount++} END{print distance/taxicount}' advanced solution: cat yellow_tripdata_2017-06.csv | grep -v "istance" | awk -F, '{if (NF >= 3) {distance += $5; taxicount++}} END{print distance/taxicount}' solution: 2.97862 4b. basic solution: nohup cat yellow*.csv | awk -F, '{distance += $5; taxicount++} END{print distance/taxicount}' >~/newnewresult4a.txt & advanced solution: nohup cat yellow*.csv | grep -v "istance" | awk -F, '{if (NF >= 3) {distance += $5; taxicount++}} END{print distance/taxicount}' >~/newnewresult4b.txt & solution: 5.12591 *********************** 5. for year in {2009..2014}; do for month in {01..12}; do nohup cat yellow_tripdata_${year}-${month}.csv | grep -v olls | awk -F, '{if((NF > 3)&&($18!=0)) {percentage += $17/$18; counter++}} END{print percentage/counter, counter}' >~/taxitolls${year}-month${month}result.txt & done done for year in {2015..2015}; do for month in {01..12}; do nohup cat yellow_tripdata_${year}-${month}.csv | grep -v olls | awk -F, '{if((NF > 3)&&($19!=0)) {percentage += $17/$19; counter++}} END{print percentage/counter, counter}' >~/taxitolls${year}-month${month}result.txt & done done for year in {2016..2016}; do for month in {01..06}; do nohup cat yellow_tripdata_${year}-${month}.csv | grep -v olls | awk -F, '{if((NF > 3)&&($19!=0)) {percentage += $17/$19; counter++}} END{print percentage/counter, counter}' >~/taxitolls${year}-month${month}result.txt & done done for year in {2016..2016}; do for month in {07..12}; do nohup cat yellow_tripdata_${year}-${month}.csv | grep -v olls | awk -F, '{if((NF > 3)&&($17!=0)) {percentage += $15/$17; counter++}} END{print percentage/counter, counter}' >~/taxitolls${year}-month${month}result.txt & done done for year in {2017..2017}; do for month in {01..12}; do nohup cat yellow_tripdata_${year}-${month}.csv | grep -v olls | awk -F, '{if((NF > 3)&&($17!=0)) {percentage += $15/$17; counter++}} END{print percentage/counter, counter}' >~/taxitolls${year}-month${month}result.txt & done done # once all of those are done running, # then tabulate the data cat ~/taxitolls20*.txt | awk '{total += $1*$2; counter += $2} END{print total/counter}' solution: 0.00542058 6. for year in {2009..2015}; do for month in {01..12}; do nohup cat yellow_tripdata_${year}-${month}.csv | grep -v ayment | awk -F, '{if(NF > 3) {totaldistance[$12] += $5; counter[$12]++;}} END{for (key in totaldistance) {print key, totaldistance[key], counter[key];}}' >~/taxipaymenttypes${year}-month${month}result.txt & done done for year in {2016..2016}; do for month in {01..06}; do nohup cat yellow_tripdata_${year}-${month}.csv | grep -v ayment | awk -F, '{if(NF > 3) {totaldistance[$12] += $5; counter[$12]++;}} END{for (key in totaldistance) {print key, totaldistance[key], counter[key];}}' >~/taxipaymenttypes${year}-month${month}result.txt & done done for year in {2016..2016}; do for month in {07..12}; do nohup cat yellow_tripdata_${year}-${month}.csv | grep -v ayment | awk -F, '{if(NF > 3) {totaldistance[$10] += $5; counter[$10]++;}} END{for (key in totaldistance) {print key, totaldistance[key], counter[key];}}' >~/taxipaymenttypes${year}-month${month}result.txt & done done for year in {2017..2017}; do for month in {01..12}; do nohup cat yellow_tripdata_${year}-${month}.csv | grep -v ayment | awk -F, '{if(NF > 3) {totaldistance[$10] += $5; counter[$10]++;}} END{for (key in totaldistance) {print key, totaldistance[key], counter[key];}}' >~/taxipaymenttypes${year}-month${month}result.txt & done done cat ~/taxipaymenttypes*.txt | awk '{if(!(($1 > 40)&&($1 < 42))) {print $0}}' | awk '{totaldistance[$1] += $2; totalcount[$1] += $3;} END{for (key in totaldistance) {print key, totaldistance[key], totalcount[key], totaldistance[key]/totalcount[key];}}' 7a. cd /depot/statclass/data/dataexpo2009 cat 2005.csv | grep -v UniqueCarrier | awk -F, '{distance[$9] += $19} END{for (key in distance) {print key, distance[key]}}' AA 722852274 DL 564268170 XE 213291464 MQ 211059565 OO 195573943 CO 334136379 B6 150758132 FL 127526045 AS 138031164 TZ 47556475 HA 28642799 UA 532116122 OH 170772547 HP 200373825 NW 365044151 WN 628492880 F9 48562429 DH 60671620 EV 139250724 US 288955635 7b. cat [1-2]*.csv | grep -v UniqueCarrier | awk -F, '{distance[$9] += $19} END{for (key in distance) {print key, distance[key]}}' AA 14237240059 XE 1261704518 DL 11782682821 OO 1199143412 MQ 1446828218 PA (1) 213910356 TW 2733374003 B6 970096179 FL 843208347 AQ 52022302 CO 7290881290 ML (1) 47795815 EA 557435834 TZ 239451257 AS 2138434915 YV 339860468 HA 161922365 UA 12185717876 PS 30274790 OH 687290174 9E 235073027 NW 7301968497 HP 2735172637 WN 8085268722 F9 299595575 DH 259805885 PI 331802193 EV 764868753 US 8109732855 8a. # Notice that we choose to print the distances and then the tailnums # and we sort by the distances, and we are only printing the tail cat 2005.csv | grep -v UniqueCarrier | awk -F, '{distance[$11] += $19} END{for (key in distance) {print distance[key], key}}' | sort -n | tail 2034981 N211UA 2036892 N588JB 2039066 N590JB 2045220 N589JB 2045625 N593JB 2050927 N213UA 2060375 N598JB 2069280 N550JB 7597848 000000 37522198 0 8b. cat [1-2]*.csv | grep -v UniqueCarrier | awk -F, '{distance[$11] += $19} END{for (key in distance) {print distance[key], key}}' | sort -n | tail 23882022 N550UA 23886947 N552UA 23893505 N543UA 24073793 N551UA 48624558 000000 81199937 119055508 �NKNO 184998252 0 387812613 UNKNOW 23723559710 NA 9. cd /depot/statclass/data/taxi2018 9abc (June 2017 only). basic solution: cat yellow_tripdata_2017-06.csv | awk -F, '{distance[$4] += $5; taxicount[$4]++} END{ for (key in distance) {print key, distance[key]/taxicount[key]}}' | sort -n 0 0 0.417445 passenger_count 0 1 2.92293 2 3.16181 3 3.09691 4 3.19115 5 3.03522 6 3.04735 7 3.54154 8 5.56933 9 5.4615 advanced solution: cat yellow_tripdata_2017-06.csv | grep -v "istance" | awk -F, '{if (NF >= 3) {distance[$4] += $5; taxicount[$4]++}} END{ for (key in distance) {print key, distance[key]/taxicount[key]}}' | sort -n 0 0.417445 1 2.92293 2 3.16181 3 3.09691 4 3.19115 5 3.03522 6 3.04735 7 3.54154 8 5.56933 9 5.4615 9abc (all years). basic solution: nohup cat yellow*.csv | awk -F, '{distance[$4] += $5; taxicount[$4]++} END{ for (key in distance) {print key, distance[key]/taxicount[key]}}' | sort -n >~/newnewresult9a.txt & 0 0 2.30545 passenger_count 0 passenger_count 0 Passenger_Count 0 1 5.52209 2 4.91924 3 4.88274 4 4.25553 5 2.74503 6 2.9605 7 3.30694 8 4.75099 9 5.90511 10 0.318235 13 12.89 15 2.21 17 9.18 19 0.69 25 0.87 33 1.615 34 3.57 36 20.16 37 2.16 38 1.45 47 2.56 49 0 51 2.65 53 1.86 58 5.815 61 8.78 65 7.51333 66 4.5 69 1.28 70 3.06 84 13.98 91 9.44 97 1.87 113 0 125 3.83 129 1.59 133 3.05 134 20.92 137 17.64 141 3.81 155 16.53 158 1.57 160 1.48 163 3.03 164 22.98 165 1.44 169 14.76 177 1.34 192 1.07 193 1.74 208 0.0582838 211 0.97 213 0 223 1.16 225 4.83 229 3.27 232 722862 247 3.31 249 1.69 250 3.64333 254 1.02 255 2.632 advanced solution: nohup cat yellow*.csv | grep -v "istance" | awk -F, '{if (NF >= 3) {distance[$4] += $5; taxicount[$4]++}} END{ for (key in distance) {print key, distance[key]/taxicount[key]}}' | sort -n >~/newnewresult9b.txt & 0 2.30545 1 5.52209 2 4.91924 3 4.88274 4 4.25553 5 2.74503 6 2.9605 7 3.30694 8 4.75099 9 5.90511 10 0.318235 13 12.89 15 2.21 17 9.18 19 0.69 25 0.87 33 1.615 34 3.57 36 20.16 37 2.16 38 1.45 47 2.56 49 0 51 2.65 53 1.86 58 5.815 61 8.78 65 7.51333 66 4.5 69 1.28 70 3.06 84 13.98 91 9.44 97 1.87 113 0 125 3.83 129 1.59 133 3.05 134 20.92 137 17.64 141 3.81 155 16.53 158 1.57 160 1.48 163 3.03 164 22.98 165 1.44 169 14.76 177 1.34 192 1.07 193 1.74 208 0.0582838 211 0.97 213 0 223 1.16 225 4.83 229 3.27 232 722862 247 3.31 249 1.69 250 3.64333 254 1.02 255 2.632 10. cat yellow_tripdata_2017-06.csv | awk -F[,\ ] '{print $2}' | sort -n | uniq -c 1 1 tpep_pickup_datetime 344507 2017-06-01 347404 2017-06-02 341807 2017-06-03 294236 2017-06-04 304042 2017-06-05 341499 2017-06-06 339808 2017-06-07 353452 2017-06-08 342240 2017-06-09 337959 2017-06-10 283088 2017-06-11 311495 2017-06-12 333931 2017-06-13 349305 2017-06-14 347838 2017-06-15 341823 2017-06-16 318478 2017-06-17 277743 2017-06-18 306068 2017-06-19 318727 2017-06-20 331000 2017-06-21 338890 2017-06-22 341160 2017-06-23 317617 2017-06-24 248929 2017-06-25 290740 2017-06-26 321083 2017-06-27 316000 2017-06-28 313277 2017-06-29 302847 2017-06-30