-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzapocet.txt
130 lines (97 loc) · 3.04 KB
/
zapocet.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
//*--------------------
Martin Rypar
BDT
Zapocet, 9.1.2019
--------------------*//
1. HDFS
hdfs dfs -chmod 755 /user/ryparmar
hdfs dfs -mkdir chess_ratings
hdfs dfs -chmod 755 /user/ryparmar/chess_ratings
hdfs dfs -put /home/pascepet/fel_bigdata/data/chess_ratings/standard_????.csv /user/ryparmar/chess_ratings
2. HIVE
CREATE EXTERNAL TABLE ratings_ext (
id int,
name string,
state char(3),
sex char(1),
rating int,
games int,
born_year int,
listed_year int,
listed_month int
)
ROW FORMAT
DELIMITED FIELDS TERMINATED BY '|'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION "/user/pascepet/data/chess_ratings_red";
CREATE TABLE ratings (
id int,
name string,
state char(3),
sex char(1),
rating int,
games int,
born_year int,
listed_year int,
listed_month int
)
STORED AS ORC
TBLPROPERTIES("orc.compress"="ZLIB");
INSERT INTO TABLE ratings
SELECT
id,
name,
state,
sex,
rating,
games,
born_year,
listed_year,
listed_month
FROM ratings_ext
WHERE name is not NULL AND born_year > 0 AND rating is not NULL;
//* DODELANO EXPOST *//
SELECT ratings.id, ratings.name, max(ratings.rating)-min(ratings.rating) AS diff FROM ratings GROUP BY ratings.id, ratings.name ORDER BY diff DESC LIMIT 100;
//* JESTE JEDEN DOTAZ - nepamatuji si *//
3. SPARK RDD
dd = sc.textFile("/user/pascepet/data/chess_ratings_red/")
dd1 = dd.map(lambda line: line.split("|"))
def parse(line):
if line[1] != "":
return line[0] + line[1]
else: return ""
dd2 = dd1.map(lambda line: parse(line))
dd2 = dd2.filter(lambda line: line != "")
dd2.distinct().count()
dd3 = dd1.map(lambda line: len(line[1].split(",")))
dd3 = dd3.map(lambda line: [line, 1])
dd4 = dd3.reduceByKey(lambda x,y: x+y)
def parse2(line):
if line[1] != "":
tmp = line[1].split(",")
ret = 0
for i in tmp:
if i.replace(" ", "") == "Mustafa":
ret = ret + 1
else: ret = 0
return ret
dd5 = dd1.map(lambda line: parse2(line)).sum()
4. SPARK SQL
from pyspark.sql.types import *
from pyspark.sql import functions as F
schema = StructType([
StructField("id", LongType()),
StructField("name", StringType()),
StructField("state", StringType()),
StructField("sex", StringType()),
StructField("rating", IntegerType()),
StructField("games", IntegerType()),
StructField("born_year", IntegerType()),
StructField("listed_year", IntegerType()),
StructField("listed_month", IntegerType())
])
df = sqlContext.read.format("com.databricks.spark.csv").option("header", "false").option("delimiter", '|').schema(schema).load("/user/pascepet/data/chess_ratings_red/*.csv")
df.filter((df.sex == "F") & (df.state == "CZE" ) & (df.rating > 2300)).select(df.name).distinct().show()
df.groupBy(df.name).agg({'games': 'count'}).toDF('name', 'games').filter("name != ''").orderBy('games', ascending=False).take(3)
df.filter( ((df.listed_year == 2017) & (df.listed_month == 12)) | ((df.listed_year == 2018) & (df.listed_month == 12)) ).groupBy(df.name).agg({'rating': 'avg'}).toDF('name', 'avg_rating').show()